ARG-MOB R analyses and plotting

Disclaimer: This document is intended for documentation of the analyses for the publication Mobilization of antibiotic resistance genes differ by resistance mechanism.

It is not intended to be published as a method that can be routinely applied by other people, but only serves to document how analyses have been performed. It is derived from the development version of the analyses script and is, as such, not super polished but only functions to document the analyses and tests that are included in the final article. Figures do not appear in the same order as they do in the article and may appear in a slightly different size, format and color, due to the fact that the figures in the article were not produced from this Rmarkdown report. However, all figures have been checked that they show the same data and that the interpretation and conclusions from them are the same in the article and in this report.

Load packages and define wd

knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(ggpubr)
library(ggExtra)
library(reshape2)
library(knitr)
library(kableExtra)
library(vegan)
library(PerformanceAnalytics)
library(ComplexHeatmap)
library(RColorBrewer)
library(DT)
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.2
library(rstatix)
## Warning: package 'rstatix' was built under R version 3.6.2
library(tidyverse)
library(broom)


knitr::opts_chunk$set(tidy.opts=list(width.cutoff=50),tidy=TRUE, message=FALSE)
#setwd("/Users/tkq300/Desktop/mac_data/ProxARG/early_sim_fil/")
setwd("/Users/tkq300/Desktop/mac_data/ProxARG/final_analysis4/")
knitr::opts_knit$set(root.dir = "/Users/tkq300/Desktop/mac_data/ProxARG/final_analysis4/")

#Fixed color scale for mechanisms
colours = c("Antibiotic efflux" = "#A6CEE3", "Antibiotic inactivation" = "#1F78B4", "Antibiotic target alteration" = "#B2DF8A", "Antibiotic target protection" = "#33A02C", "Antiobiotic target replacement" = "#FB9A99")

Lengths of transposons from the Transposon Registry (Tansirichaiya et al., 2019)

# Import and reformat data
tnp_reg = read.csv("tnp_registry_supp2.csv", sep = ";", 
    dec = ",")
tnp_reg$Length = gsub(",", ".", tnp_reg$Length)
tnp_reg$Length = as.numeric(as.character(tnp_reg$Length))

tnp_reg2 = subset(tnp_reg, Type == "Unit transposon" | 
    Type == "Composite transposon")

p = ggplot(tnp_reg2, aes(Type, Length, color = Type)) + 
    geom_violin() + geom_jitter() + scale_y_continuous(breaks = seq(0, 
    80, 5)) + geom_hline(yintercept = 12.17) + theme_light() + 
    theme(legend.position = "none") + geom_hline(yintercept = 24.34, 
    linetype = "dashed") + ylab("Length (kbp)")

ggExtra::ggMarginal(p, type = "histogram", margins = "y")

# At 10 kbp, what percentage of composite and unit
# transposons do we include?
tnp_reg3 = subset(tnp_reg2, Length <= 24.34)
nrow(tnp_reg3)/nrow(tnp_reg2) * 100
## [1] 77.72829
# What is the mean?
mean(tnp_reg2$Length, na.rm = T)
## [1] 12.1732
# 12.17 kbp
median(tnp_reg2$Length, na.rm = T)
## [1] 8.2

The distance between a usually unmobilized housekeeping gene, 16S rRNA, and closest IS element

Since the 16S rRNA gene should only extremely rarely be mobilized (and fixated) by IS elements, we can use the 16S:IS distance to validate that we are searching within a reasonable distance of ARG:IS associations

# Analyse 16S distance results
dist_16S = read.csv("16S_IS_distance", sep = " ", header = F)

colnames(dist_16S) = c("Region", "IS_Start", "IS_End", 
    "IS_element", "16S_Start", "16S_end", "Distance")
# A few IS elements apparently overlap with 16S,
# resulting in negative distances. Remove these.
dist_16S = dist_16S[which(dist_16S$Distance > 1), ]

dist_16S_only = as.data.frame(dist_16S$Distance)

# For all those 16S regions without IS elements
# within 100kb in either direction, add them as
# 100kb (more than)
long_dist16 = as.data.frame(rep(1e+05, 39174))
colnames(long_dist16) = "Distance"
colnames(dist_16S_only) = "Distance"
dist_16S_with_100kb = rbind(dist_16S_only, long_dist16)


# Ignore hits that do not have IS within 100 kbp
dist_16S_only$Gene = "16S"

ggplot(dist_16S_only, aes(Gene, Distance)) + geom_violin() + 
    theme_light() + geom_hline(yintercept = 12170) + 
    labs(y = "Distance to IS element (bp)", x = "")

# How many 16S genes have IS elements within 12.17
# kbp?
nrow(subset(dist_16S_only, Distance <= 12170))
## [1] 4480
# If you include those that do not have IS elements
# within 100k, the numbers are How many 16S genes
# have IS elements within 12.17 kbp?
nrow(subset(dist_16S_with_100kb, Distance <= 12170))
## [1] 4480
# And in percentage
nrow(subset(dist_16S_with_100kb, Distance <= 12170))/nrow(dist_16S_with_100kb) * 
    100
## [1] 5.590147

Investigate database biases

# Investigate database bias in taxonomy composition
# before identifying ARGs and clustering
p_tax = read.csv("plasmid_tax", header = F, sep = "\t")
colnames(p_tax) = c("Acc.", "Superkingdom", "Kingdom", 
    "Phylum", "Class", "Order", "Family", "Genus")

c_tax = read.csv("chr_tax", header = F, sep = "\t")
colnames(c_tax) = c("Acc.", "Superkingdom", "Kingdom", 
    "Phylum", "Class", "Order", "Family", "Genus")

c_tax$type = as.character("Chr")
p_tax$type = as.character("Plasmid")
both_tax = rbind(c_tax, p_tax)

# Calculate frequencies of groups and remove those
# making up less than 0.5% of total data
both_tax2 = both_tax %>% group_by(Class, Order, type) %>% 
    tally() %>% mutate(freq = (n/sum(n)) * 100)

both_tax2 = both_tax2[which(both_tax2$freq > 0.5), 
    ]

# The below figure (Supplemental figure S7) does
# not format neatly in this Rmarkdown report. The
# code below has been commented out but was used to
# produce figure S7 ggplot(both_tax2, aes(x =
# Order, y = freq, fill = type)) + geom_bar(stat =
# 'identity',position = 'dodge') + ylab('Relative
# frequencies') + facet_wrap(~ Class, nrow = 2,
# drop = T, scales = 'free_x') + theme_light() +
# theme(axis.text.x = element_text(angle = 45,
# hjust = 1), strip.text.x = element_text(angle =
# 45))

# CARD db taxonomy was found with the below command
# in linux terminal: fasta_formatter -i
# protein_fasta_protein_homolog_model.fasta -t |
# grep -o '\[.*\]' | awk '{print $1}' | sed -e
# 's/\[//' -e 's/\]//' | sort | uniq -c | sed -e
# 's/^ *//' > protein_homolog.tax
card_tax = read.csv("protein_homolog.tax", header = F, 
    sep = " ")
colnames(card_tax) = c("n", "Tax")
card_tax$freq = card_tax$n/sum(card_tax$n) * 100
card_tax$db = "CARD"
card_tax2 = card_tax[which(card_tax$freq > 0.5), ]

# Import taxonomy from refseq complete genomes.
# Since multiple replicon may occur per strain
# entry in database, only the first and largest
# chromosome was selected here (selected with bash
# functions in UNIX)
refseq_tax = read.csv("refseq_1st_chr.tax", header = F, 
    sep = "\t")
colnames(refseq_tax) = c("Acc.", "TaxID", "Superkingdom", 
    "Kingdom", "Phylum", "Class", "Order", "Family", 
    "Genus")

refseq_tax_freq = as.data.frame(refseq_tax %>% count(Phylum, 
    Class, Order, Family, Genus) %>% mutate(freq = (n/sum(n)) * 
    100))


refseq_tax_freq = refseq_tax_freq %>% group_by(Order) %>% 
    mutate(order_freq = sum(freq)) %>% filter(order_freq > 
    0.5)


ggplot(refseq_tax_freq, aes(Order, order_freq)) + geom_bar(stat = "identity", 
    position = "dodge") + facet_wrap(~Class, scales = "free_x", 
    nrow = 2) + theme_light() + theme(axis.text.x = element_text(angle = 90, 
    hjust = 1)) + labs(y = "Relative frequency") + 
    scale_fill_brewer(palette = "Paired")

refseq_tax2 = as.data.frame(refseq_tax %>% group_by(Genus) %>% 
    tally() %>% mutate(freq = (n/sum(n)) * 100))
refseq_tax2$db = "RefSeq"

colnames(refseq_tax2) = c("Tax", "n", "freq", "db")
refseq_tax2 = refseq_tax2[, c(2, 1, 3, 4)]

# Merge CARD and RefSeq taxonomies for comparison
refseq_tax2_card = refseq_tax2[refseq_tax2$Tax %in% 
    card_tax2$Tax, ]
card_refseq_tax_sub = rbind(refseq_tax2_card, card_tax2)

# Plot abundance frequencies for CARD and RefSeq
# databases. Figure not used in main article or
# supplemental information
ggplot(card_refseq_tax_sub, aes(x = Tax, y = freq, 
    fill = db)) + geom_bar(stat = "identity", position = "dodge") + 
    ylab("Relative frequencies") + theme_light() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        strip.text.x = element_text(angle = 45))

refseq_tax2_card2 = refseq_tax2[refseq_tax2$Tax %in% 
    card_tax$Tax, ]
refseq_tax2_card3 = card_tax[card_tax$Tax %in% refseq_tax2_card2$Tax, 
    ]
colnames(refseq_tax2_card2) = c("Rn", "RTax", "Rfreq", 
    "Rdb")
colnames(refseq_tax2_card3) = c("Cn", "CTax", "Cfreq", 
    "Cdb")
common_tax = cbind(refseq_tax2_card3, refseq_tax2_card2)

# Calculate the difference in relative abundance
# between the two databases
common_tax$diff = as.numeric(as.character(common_tax$Cfreq)) - 
    as.numeric(as.character(common_tax$Rfreq))

# Select Genera with a relative difference larger
# than 0.5%
common_tax2 = common_tax[which(abs(common_tax$diff) > 
    0.5), ]
common_tax2$ndiff = common_tax2$Cn - common_tax2$Rn
for_kable = common_tax2 %>% select(2, 1, 3, 5, 7, 10, 
    9)
colnames(for_kable) = c("Genus", "#Card", "%Card", 
    "#RefSeq", "%RefSeq", "#Diff.", "%Diff.")
kable(for_kable)
Genus #Card %Card #RefSeq %RefSeq #Diff. %Diff.
3 Acinetobacter 416 15.8536585 259 1.6403825 157 14.2132760
6 Aeromonas 41 1.5625000 63 0.3990120 -22 1.1634880
12 Bacillus 33 1.2576220 630 3.9901197 -597 -2.7324978
15 Bifidobacterium 3 0.1143293 138 0.8740262 -135 -0.7596970
16 Bordetella 1 0.0381098 746 4.7248084 -745 -4.6866987
18 Brachyspira 15 0.5716463 9 0.0570017 6 0.5146446
21 Brucella 1 0.0381098 123 0.7790234 -122 -0.7409136
22 Burkholderia 13 0.4954268 273 1.7290519 -260 -1.2336250
25 Campylobacter 28 1.0670732 275 1.7417189 -247 -0.6746457
29 Chryseobacterium 23 0.8765244 49 0.3103426 -26 0.5661817
30 Citrobacter 114 4.3445122 80 0.5066819 34 3.8378303
33 Corynebacterium 5 0.1905488 248 1.5707138 -243 -1.3801650
37 Elizabethkingia 19 0.7240854 31 0.1963392 -12 0.5277461
39 Enterobacter 88 3.3536585 135 0.8550257 -47 2.4986329
42 Enterococcus 86 3.2774390 196 1.2413706 -110 2.0360684
44 Escherichia 364 13.8719512 1018 6.4475268 -654 7.4244245
55 Helicobacter 1 0.0381098 195 1.2350371 -194 -1.1969273
59 Klebsiella 427 16.2728659 502 3.1794287 -75 13.0934371
61 Lactobacillus 1 0.0381098 405 2.5650770 -404 -2.5269672
64 Legionella 4 0.1524390 120 0.7600228 -116 -0.6075838
65 Listeria 7 0.2667683 216 1.3680410 -209 -1.1012727
67 Mannheimia 1 0.0381098 90 0.5700171 -89 -0.5319073
73 Mycobacterium 6 0.2286585 264 1.6720502 -258 -1.4433916
77 Neisseria 8 0.3048780 185 1.1717018 -177 -0.8668238
92 Proteus 47 1.7911585 33 0.2090063 14 1.5821523
94 Pseudomonas 255 9.7179878 550 3.4834378 -295 6.2345500
102 Salmonella 63 2.4009146 827 5.2378238 -764 -2.8369092
103 Serratia 30 1.1432927 94 0.5953512 -64 0.5479415
107 Staphylococcus 73 2.7820122 619 3.9204509 -546 -1.1384388
110 Streptococcus 17 0.6478659 646 4.0914561 -629 -3.4435902
111 Streptomyces 54 2.0579268 187 1.1843689 -133 0.8735580
# Red = more abundant in CARD than RefSeq Select
# the most extreme cases for highlighting
Genus = c("Acinetobacter", "Klebsiella", "Escherichia", 
    "Salmonella", "Streptococcus", "Bordetella")
CARD_abund = c("15.85%", "16.27%", "13.87%", "2.40%", 
    "0.65%", "0.04%")
RefSeq_adund = c("1.62%", "3.18%", "6.45%", "5.24%", 
    "4.09%", "4.72%")

df_table = data.frame(CARD_abund, RefSeq_adund)
rownames(df_table) = Genus
colnames(df_table) = c("CARD\nAbundance", "RefSeq\nAbundance")

# Plot genera with difference relative abundance
# between the databases
ggplot(common_tax2, aes(x = reorder(CTax, -diff), diff, 
    fill = Cn)) + geom_bar(stat = "identity") + theme_light() + 
    scale_fill_continuous(high = "red", low = "blue", 
        name = "CARD abundance") + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + labs(x = "Genus", y = "Relative abundance difference") + 
    annotation_custom(tableGrob(df_table, theme = ttheme_default(base_size = 8)), 
        xmin = -10, xmax = 50, ymin = 3, ymax = 14)

Distribution of ARG DIAMOND hits passing either RGI filter or custom >80% ID (and >80% query coverage)

The RGI filter is a manually curated bitscore cutoff per ARO category. These numbers appear very arbitrary and rounded off considering they are bitscore values. An amino acid sequence might for example have a bitscore of 1399 to an ARO reference protein, but the curated RGI cutoff is set to 1400. Obviously, this query amino acid sequence represents a very good hit, but it just fails to meet the RGI bitscore cutoff. For cases like this, the >80% ID filter was included. The code below makes a figure that represents hits passing and failing the two cutoffs. For both filters, a minimum of 80% query coverage was set.

# Import diamond hits that either have a higher
# bitscore than RGI cutoff or are at least 80%
# similar. (default query/subject coverage is 80%)
# These files were created with simple parsing of
# DIAMOND hits in bash
passed_rgi = read.csv("diamond_hits_rgi_cutoff.pass", 
    header = F, sep = "\t")
failed_rgi = read.csv("diamond_hits_rgi_cutoff.fail", 
    header = F, sep = "\t")
# The files are very large - subset to 10000 random
# hits for both passing and failing hits
nrow(passed_rgi)
## [1] 176888
nrow(failed_rgi)
## [1] 1164572
passed_rgi_sub = sample_n(passed_rgi, 10000)
failed_rgi_sub = sample_n(failed_rgi, 10000)

# Combine passed and failed to one data frame
both_rgi_sub = rbind(passed_rgi_sub, failed_rgi_sub)

p = ggplot(both_rgi_sub, aes(V3, V15, color = V13)) + 
    xlim(0, 100) + geom_rect(xmin = 0, xmax = 100, 
    ymin = 0, ymax = 3, fill = "#99FF66") + geom_rect(xmin = 80, 
    xmax = 100, ymin = 0, ymax = 1, fill = "#FFA500") + 
    geom_rect(xmin = 0, xmax = 80, ymin = 0, ymax = 1, 
        fill = "#FF6633") + geom_point(alpha = 0.6) + 
    theme_bw() + labs(x = "% ID", y = "Bitratio", color = "% Qcov")

ggExtra::ggMarginal(p + theme(legend.position = "left"), 
    type = "histogram")

# Investigate the taxonomy of hits passing either
# RGI score or > 80% ID
rgi_pass = read.csv2("rgi_pass.tax", sep = "\t", header = F)
ID_pass = read.csv2("rgi_fail_ID_pass.tax", sep = "\t", 
    header = F)
rgi_pass$filter = "RGI bitscore"
ID_pass$filter = "ID > 80%"
both_pass = rbind(rgi_pass, ID_pass)
colnames(both_pass) = c("Superkingdom", "Kingdom", 
    "Phylum", "Class", "Order", "Family", "Genus", 
    "Filter")

both_pass2 = both_pass %>% count(Phylum, Class, Order, 
    Filter) %>% mutate(freq = (n/sum(n) * 100))

major_orders = both_pass2 %>% filter(freq > 0.2)

ggplot(major_orders, aes(Order, freq, fill = Filter)) + 
    geom_bar(stat = "identity", position = "dodge") + 
    facet_grid(~Class, scales = "free_x") + theme_light() + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
    labs(y = "Relative frequency") + scale_fill_brewer(palette = "Paired")

minor_orders = both_pass2 %>% filter(freq < 0.2) %>% 
    filter(n > 10)

ggplot(minor_orders, aes(Order, freq, fill = Filter)) + 
    geom_bar(stat = "identity", position = "dodge") + 
    facet_wrap(~Class, scales = "free_x", nrow = 2) + 
    theme_light() + theme(axis.text.x = element_text(angle = 90, 
    hjust = 1)) + labs(y = "Relative frequency") + 
    scale_fill_brewer(palette = "Paired")

Inspect the impact of clustering to CRLs

dist_data=read.csv("ARG_IS_distances_mech.txt",sep = "\t",header=F)
colnames(dist_data) = c("Region","ARO","Updist","Downdist","IS_Up","IS_Down","Replicon",
                        "Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism")
dist_data$Cluster_size = gsub("size_","",dist_data$Cluster_size)

#For each region with an ARG, there can be up to two closest IS element: upstream and downstream. The distance to these are denoted as Updist and Downdist, while the Mean_dist column is the mean of the two. A distance of 0 indicates that no IS element was found. 

#Cluster_size refers to how many individual regions were clustered to form a given CRL Region.

dist_data2 = dist_data
dist_data2$ARO = gsub("ARO:","",dist_data2$ARO)

#Make a new dataframe where various metric are calculated and summarized

all_aros = unique(as.vector(dist_data2$ARO))
df <- data.frame(ARO=character(),
                 Occurrences=numeric(), 
                 Ratio=numeric(), 
                 Mean_distance=numeric(),
                 SD_distance=numeric(),
                 Mechanism=character(),
                 Submechanism=character(),
                 Zero_IS=numeric(),
                 Replicon=character(),
                 Total_seqs=numeric())

for (ARO in all_aros){
  temp <- dist_data2[which(dist_data2$ARO==ARO), ]
  Zero_IS = nrow(temp[which(temp$Mean_dist == 0),])
  number_nonzero = nrow(temp[which(temp$Mean_dist > 0),])
  Ratio = number_nonzero/nrow(temp)
  Occurrences = nrow(temp)
  temp_nonzero = temp[which(temp$Mean_dist > 0),]
  Mean_distance = mean(temp_nonzero$Mean_dist)
  SD_distance = sd(temp_nonzero$Mean_dist)
  Mechanism = unique(as.vector(temp$Mechanism))
  IS_Up = unique(as.vector(temp$IS_Up))
  IS_Down = unique(as.vector(temp$IS_Down))
  Submechanism = unique(as.vector(temp$Submechanism))
  p_rep = nrow(temp[which(temp$Replicon == "Plasmid"),])
  c_rep = nrow(temp[which(temp$Mean_dist == "Chr"),])
  Replicon = p_rep/nrow(temp)
  Total_seqs = sum(as.numeric(temp$Cluster_size))
  temp2 = as.data.frame(cbind(ARO,Occurrences,Ratio,Mean_distance,SD_distance,Mechanism,Submechanism,Zero_IS,Replicon,Total_seqs))
  df = rbind(df,temp2)
}


#Make sure that values in columns have a correct (numeric) format
df$Occurrences = as.numeric(as.character(df$Occurrences))
df$Mean_distance = as.numeric(as.character(df$Mean_distance))
df$SD_distance = as.numeric(as.character(df$SD_distance))
df$Ratio = as.numeric(as.character(df$Ratio))
df$Zero_IS = as.numeric(as.character(df$Zero_IS))
df$Replicon = as.numeric(as.character(df$Replicon))
df$Total_seqs = as.numeric(as.character(df$Total_seqs))



#Check the number of CRLs and AROs
dist_data %>%
  group_by(Mechanism) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 9 x 2
##   Mechanism                                                      n
##   <fct>                                                      <int>
## 1 antibiotic_efflux                                          28953
## 2 antibiotic_inactivation                                    13763
## 3 antibiotic_target_alteration                                4856
## 4 antibiotic_target_replacement                               3360
## 5 antibiotic_target_protection                                1428
## 6 antibiotic_efflux;reduced_permeability_to_antibiotic         962
## 7 antibiotic_target_alteration;antibiotic_target_replacement   276
## 8 antibiotic_target_alteration;antibiotic_efflux               155
## 9 reduced_permeability_to_antibiotic                           142
unique(df) %>%
  group_by(Mechanism) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 9 x 2
##   Mechanism                                                      n
##   <fct>                                                      <int>
## 1 antibiotic_inactivation                                      718
## 2 antibiotic_efflux                                            225
## 3 antibiotic_target_alteration                                 128
## 4 antibiotic_target_protection                                  58
## 5 antibiotic_target_replacement                                 39
## 6 reduced_permeability_to_antibiotic                             3
## 7 antibiotic_efflux;reduced_permeability_to_antibiotic           2
## 8 antibiotic_target_alteration;antibiotic_target_replacement     2
## 9 antibiotic_target_alteration;antibiotic_efflux                 1
#Plot the "compression" of CRLs: How many individual were clustered into the respective CRLs.
p = ggplot(df, aes(x = Total_seqs, y = Occurrences, color = 100-(Occurrences/Total_seqs)*100)) +
  geom_point(position = position_jitter(width = 0.05,height = 0.01)) +
  scale_y_log10() +
  scale_x_log10() +
  theme_light() +
  theme(legend.position=c(.2,.7),
        legend.key.size = unit(0.4, "cm"),
        legend.background = element_rect(colour = "black")) +
  labs(x ="Total unclustered regions",y = "Number of CRLs", color = "% CRL\ncompression") +
  scale_color_gradient(high = "red",low = "blue") +
  ggtitle("B: CRL compression")


pcompress1 = print(ggMarginal(p, type = "violin"))

pcompress1
## NULL
#Also make an interactive version of the plot
mytext = paste("ARO = ",df$ARO,"\n","Total unclustered regions = ",df$Total_seqs,"\n","Number of CRLs = ",df$Occurrences,"\n","Mechanism = ",df$Mechanism)
pp = plotly_build(p)
style(pp, text = mytext, hoverinfo = "text")
df2 = df
#Rename mechanisms
df2$Mechanism = gsub("antibiotic_efflux","Antibiotic efflux",df2$Mechanism)
df2$Mechanism = gsub("antibiotic_target_replacement","Antibiotic target replacement",df2$Mechanism)
df2$Mechanism = gsub("antibiotic_target_alteration","Antibiotic target alteration",df2$Mechanism)
df2$Mechanism = gsub("antibiotic_inactivation","Antibiotic inactivation",df2$Mechanism)
df2$Mechanism = gsub("reduced_permeability_to_antibiotic","Reduced permeability to antibiotic",df2$Mechanism)
df2$Mechanism = gsub("antibiotic_target_protection","Antibiotic target protection",df2$Mechanism)


#Test CRL compression per mechanism. Select only major mechanisms for testing
df3 = df2 %>%
  filter(Mechanism == "Antibiotic efflux" |
           Mechanism == "Antibiotic target replacement" |
           Mechanism == "Antibiotic target alteration" |
           Mechanism == "Antibiotic inactivation" |
           Mechanism == "Antibiotic target protection")

df3$Compression = 100-(df3$Occurrences/df3$Total_seqs)*100

#Perform pairwise Mann-Whitney tests (unpaired Wilcoxon Rank Sum) on the compression rates
comp_means = compare_means(Compression ~ Mechanism, data = df3, method = "wilcox.test",paired = F,p.adjust.method = "fdr") %>%
  #mutate(y_pos = 2) %>%
  filter(p.adj < 0.05)

#If plyr is loaded after dplyr it messes up some dplyr functions.
#detach(package:plyr)

means = df3 %>%
  group_by(Mechanism) %>%
  drop_na(Compression) %>%
  summarise(mean_compression = mean(Compression))
colnames(means) = c("group1","mean1")

comp_means = comp_means %>%
  full_join(means, by = "group1") %>%
  drop_na(p) %>%
  arrange(desc(mean1))

colnames(means) = c("group2","mean2")
comp_means = comp_means %>%
  full_join(means, by = "group2") %>%
  drop_na(p) %>%
  arrange(desc(mean1),desc(mean2))

comp_means$y_pos = seq(1, 0.9+nrow(comp_means)*10, 10)
comp_means$y_pos = comp_means$y_pos+100
for(i in 1:nrow(comp_means)){
  if (comp_means$p.adj[i] < 0.0001){
    comp_means$p.signif[i]="****"}
  else if (comp_means$p.adj[i] < 0.001){
    comp_means$p.signif[i]="***"}
  else if (comp_means$p.adj[i] < 0.01){
    comp_means$p.signif[i]="**"}
  else if (comp_means$p.adj[i] < 0.05){
    comp_means$p.signif[i]="*"}
  else if(comp_means$p.adj[i] > 0.05){
    comp_means$p.signif[i]="ns"}
}



pcompress2 = ggplot(df3, aes(x = reorder(Mechanism, -(100-(Occurrences/Total_seqs))*100, FUN = mean),
                             y =  100-(Occurrences/Total_seqs)*100, color = Mechanism)) +
  geom_boxplot(outlier.shape = NA) +
  stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
               width = 0.75, size = 1.5, linetype = "dashed", color = "blue") +
  geom_jitter(size =0.5) +
  theme_light() +
  ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2, annotations=p.signif, y_position=y_pos), manual=T, color = "black", tip_length = 0,vjust=0.5) +
  theme(axis.text.x = element_text(angle = 45,hjust = 1), legend.position = "none") +
  labs(y = "CRL compression", x = "Mechanism") +
  ggtitle("C: Compression rate per mechanism") +
  scale_y_continuous(labels= seq(0,100,25), breaks= seq(0,100,25), limits = c(0,140)) 

pcompress2

#Import taxonomy data to see how clustering affect the relative abundance of genera
aro_tax = read.csv("aro_tax.Rdat",sep="\t",header =F)

colnames(aro_tax) = c("Region","ARO","Up_dist","Down_dist","IS_up","IS_down",
                      "Replicon","Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism","Acc.","Superkingdom",
                      "Kingdom","Phylum","Class","Order","Family","Genus")
aro_tax$Superkingdom = as.character(aro_tax$Superkingdom)
aro_tax$ARO = gsub("ARO:","",aro_tax$ARO)
#If superkingdom is empty, remove the line
#aro_tax <- aro_tax[-which(aro_tax$Superkingdom == ""), ]
#Remove plasmids
aro_tax <- aro_tax[-which(aro_tax$Replicon == "Plasmid"), ]


#Compare relative abundance of genera with those in RefSeq complete genome db
refseq_tax = read.csv("refseq_1st_chr.tax", header = F, sep = "\t")
colnames(refseq_tax) = c("Acc.","TaxID","Superkingdom","Kingdom","Phylum","Class","Order","Family","Genus")

clust_c_tax = read.csv("post_clust_tax_c", header = F, sep = "\t")
#Each replicon can have multiple ARGs. Sort-unique the lines to only get one line per replicon
clust_c_uniq = clust_c_tax[!duplicated(clust_c_tax),]
colnames(clust_c_uniq) = c("Acc.","Superkingdom","Kingdom","Phylum","Class","Order","Family","Genus")

clust_c_tax2 = as.data.frame(clust_c_uniq %>%
                               group_by(Genus) %>%
                               tally() %>%
                               mutate(Cfreq = (n/sum(n))*100))


refseq_tax2 = as.data.frame(refseq_tax %>%
                              group_by(Genus) %>%
                              tally() %>%
                              mutate(freq = (n / sum(n))*100))

aro_tax2 = as.data.frame(aro_tax %>%
                           group_by(Genus) %>%
                           tally() %>%
                           #count(Genus) %>%
                              mutate(freq = (n / sum(n))*100))
colnames(refseq_tax2) = c("RTax","Rn","Rfreq")
colnames(aro_tax2) = c("ATax","An","Afreq")
colnames(clust_c_tax2) = c("CTax","Cn","Cfreq")

#Get only RefSeq entries that also occur in aro_tax
refseq_in_aro = refseq_tax2[refseq_tax2$RTax %in% aro_tax2$ATax,]
refseq_in_clust = refseq_tax2[refseq_tax2$RTax %in% clust_c_tax2$CTax,]
aro_in_refseq = aro_tax2[aro_tax2$ATax %in% refseq_tax2$RTax,]
clust_in_refseq = clust_c_tax2[clust_c_tax2$CTax %in% refseq_tax2$RTax,]

aro_in_refseq = aro_tax2[aro_tax2$ATax %in% refseq_in_aro$RTax,]
clust_in_refseq = clust_c_tax2[clust_c_tax2$CTax %in% refseq_in_aro$RTax,]

common_tax_aro = cbind(refseq_in_aro,aro_in_refseq)
common_tax_clust = cbind(refseq_in_clust,clust_in_refseq)

common_tax_aro$diff = as.numeric(as.character(common_tax_aro$Afreq)) - as.numeric(as.character(common_tax_aro$Rfreq))
common_tax_clust$diff = as.numeric(as.character(common_tax_clust$Cfreq)) - as.numeric(as.character(common_tax_clust$Rfreq))

data_lollipop_aro = as.data.frame(cbind(as.character(common_tax_aro$ATax),
                                    as.numeric(as.character(common_tax_aro$Afreq)),
                                    as.numeric(as.character(common_tax_aro$Rfreq))))

colnames(data_lollipop_aro) = c("Tax","ARG","RefSeq")

data_lollipop_clust = as.data.frame(cbind(as.character(common_tax_clust$CTax),
                                        as.numeric(as.character(common_tax_clust$Cfreq)),
                                        as.numeric(as.character(common_tax_clust$Rfreq))))

colnames(data_lollipop_clust) = c("Tax","Clustered","RefSeq")


data_aro <- data_lollipop_aro %>% 
  rowwise() %>% 
  mutate(mymean = rowMeans(cbind(as.numeric(as.character(ARG)),as.numeric(as.character(RefSeq))))) %>% 
  arrange(mymean) %>%
  filter(as.numeric(as.character(ARG)) > 1 | as.numeric(as.character(RefSeq)) > 1) %>%
  arrange(desc(mymean)) %>%
  mutate(x=factor(Tax, Tax))

data_clust <- data_lollipop_clust %>% 
  rowwise() %>% 
  mutate(mymean = rowMeans(cbind(as.numeric(as.character(Clustered)),as.numeric(as.character(RefSeq))))) %>% 
  arrange(mymean) %>%
  filter(as.numeric(as.character(Clustered)) > 1 | as.numeric(as.character(RefSeq)) > 1) %>%
  arrange(desc(mymean)) %>%
  mutate(x=factor(Tax, Tax))


data_clust_aro = data_clust %>% full_join(data_aro, by = "Tax") %>%
  select(1,2,3,4,6,8)

#CARD db taxonomy was found with the linux terminal command: 
#fasta_formatter -i protein_fasta_protein_homolog_model.fasta -t | grep -o "\[.*\]" | awk '{print $1}' | sed -e 's/\[//' -e 's/\]//' | sort | uniq -c | sed -e 's/^ *//' > protein_homolog.tax
card_tax = read.csv("protein_homolog.tax", header = F, sep = " ")
colnames(card_tax) = c("n","Tax")
card_tax$freq = card_tax$n/sum(card_tax$n)*100
card_tax$db = "CARD"
card_tax$Tax = as.character(card_tax$Tax)


data_clust_aro_card = as.data.frame(data_clust_aro) %>% 
  full_join(card_tax, by = "Tax") %>%
  select(1,2,3,4,5,6,8) %>%
  slice(1:nrow(data_clust_aro))


plolli = ggplot(data_clust_aro_card) +
  geom_segment( aes(x=reorder(Tax,mymean.x), xend=reorder(Tax,mymean.x), 
                    y=as.numeric(as.character(Clustered)), yend=as.numeric(as.character(ARG))), color="grey") +
  geom_point( aes(x=reorder(Tax,mymean.x), y=as.numeric(as.character(Clustered))), color="forestgreen", size=3, alpha = 0.6 ) +
  geom_point( aes(x=reorder(Tax,mymean.x), y=as.numeric(as.character(ARG))), color="blue", size=3, alpha = 0.6 ) +
  geom_point( aes(x=reorder(Tax,mymean.x), y=as.numeric(as.character(RefSeq.x))), color="black", size=3, shape = 2 ) +
  geom_point( aes(x=reorder(Tax,mymean.x), y=as.numeric(as.character(freq))), color="red", size=3, shape = 2 ) +
  coord_flip()+
  theme_light() +
  theme(axis.text.y = element_text(face = "italic")) +
  xlab("") +
  ylab("Percentage of data") +
  ylim(0.01,35) +
  ggtitle("A: Relative abundance of top genera")


#Get the legend only
data_clust_aro_card_melt = melt(data_clust_aro_card,id.vars = "Tax", 
                                measure.vars = c("ARG","Clustered","RefSeq.x","freq"))

legend <- cowplot::get_legend(ggplot(data_clust_aro_card_melt, aes(x=Tax,y=as.numeric(as.character(value)),color = variable, shape = variable)) +
  geom_point() +
  coord_flip() +
  theme_light() +
  scale_color_manual(labels = c("Unclustered loci","Clustered loci (CRLs)","RefSeq","CARD"),values = c("blue","forestgreen","black","red")) +
  scale_shape_manual(labels = c("Unclustered loci","Clustered loci (CRLs)","RefSeq","CARD"),values = c(16,16,2,2)) +
  theme(legend.title = element_blank(), 
        legend.key.size = unit(0.6, "cm"),
        legend.text = element_text(size = 10),
        legend.background = element_rect(colour = "black")) +
  guides(color = guide_legend(override.aes = list(size=4))))

plolli2 = plolli + annotation_custom(legend,xmin = -20,ymin=10)

plolli2

#Get Euclidean distances
common_tax_aro_df = as.data.frame(cbind(common_tax_aro$Rfreq,common_tax_aro$Afreq))
colnames(common_tax_aro_df) = c("RefSeq freq.","Card freq.")
#Before clustering to CRLs, the euclidean distance between relative abundance of genera found in both databases is:
dist(t(common_tax_aro_df), method = "euclidean")
##            RefSeq freq.
## Card freq.     30.89466
common_tax_clust_df = as.data.frame(cbind(common_tax_clust$Rfreq,common_tax_clust$Cfreq))
colnames(common_tax_clust_df) = c("RefSeq freq.","CRL freq.")
#After clustering to CRLs, the euclidean distance is reduced to:
dist(t(common_tax_clust_df), method = "euclidean")
##           RefSeq freq.
## CRL freq.     10.26442
#The combined plot is not displayed properly in Rmarkdown, but is called with
#grid.arrange(plolli2,pcompress1, pcompress2, layout_matrix = rbind(c(1,2),c(1,3)))

#What is the accumulated percentage of the datasets?
paste("CRL:",sum(as.numeric(as.character(data_clust_aro_card$Clustered))),"%")
## [1] "CRL: 78.426906069945 %"
paste("Unclustered loci:",sum(as.numeric(as.character(data_clust_aro_card$ARG)),na.rm = T),"%")
## [1] "Unclustered loci: 92.6771590340765 %"
paste("CARD:",sum(as.numeric(as.character(data_clust_aro_card$freq)),na.rm = T),"%")
## [1] "CARD: 81.5167682926829 %"
paste("RefSeq:",sum(as.numeric(as.character(data_clust_aro_card$RefSeq.x)),na.rm = T),"%")
## [1] "RefSeq: 61.6378491354741 %"

Main analyses

# Filter low-occurring entries for plotting
df_filt = df[which(df$Occurrences > 2), ]
# Remove hybrid mechanisms
df_filt = subset(df_filt, !(Mechanism == "antibiotic_target_alteration;antibiotic_target_replacement" | 
    Mechanism == "antibiotic_target_alteration;antibiotic_efflux" | 
    Mechanism == "antibiotic_efflux;reduced_permeability_to_antibiotic"))
# Rename major mechanisms
df_filt$Mechanism = gsub("antibiotic_efflux", "Antibiotic efflux", 
    df_filt$Mechanism)
df_filt$Mechanism = gsub("antibiotic_target_replacement", 
    "Antibiotic target replacement", df_filt$Mechanism)
df_filt$Mechanism = gsub("antibiotic_target_alteration", 
    "Antibiotic target alteration", df_filt$Mechanism)
df_filt$Mechanism = gsub("antibiotic_inactivation", 
    "Antibiotic inactivation", df_filt$Mechanism)
df_filt$Mechanism = gsub("reduced_permeability_to_antibiotic", 
    "Reduced permeability to antibiotic", df_filt$Mechanism)
df_filt$Mechanism = gsub("antibiotic_target_protection", 
    "Antibiotic target protection", df_filt$Mechanism)


# Produce figure 4
p = ggplot(df_filt, aes(Ratio, Occurrences, color = Replicon)) + 
    scale_y_log10() + stat_density_2d(aes(fill = stat(nlevel)), 
    geom = "polygon", n = 200) + facet_wrap(. ~ Mechanism) + 
    scale_fill_viridis_c("Scaled\ndensity", option = "E", 
        alpha = 0.6) + theme_light() + geom_point(alpha = 0.8) + 
    scale_color_continuous(high = "red", low = "blue") + 
    labs(color = "Replicon\nratio", y = "Number of unique CRLs in AROs (log10)", 
        x = "IS ratio")

plotly_build(p)
# Instead of unique CRLs in AROs on Y-axis, plot
# the mean distance (of upstream and downstream ARG
# to IS distance). This produces supplemental
# figure S9 which shows that Antibiotic Efflux
# genes are further distanced from IS elements than
# the other mechanisms.
p_dist_dens = ggplot(df_filt, aes(Ratio, Mean_distance, 
    color = Replicon)) + stat_density_2d(aes(fill = stat(nlevel)), 
    geom = "polygon", n = 200) + facet_wrap(. ~ Mechanism) + 
    scale_fill_viridis_c("Scaled\ndensity", option = "E", 
        alpha = 0.6) + theme_light() + geom_point(alpha = 0.8) + 
    scale_color_continuous(high = "red", low = "blue") + 
    labs(color = "Replicon\nratio", y = "Mean distance to closest IS element (bp)", 
        x = "IS ratio") + ggtitle("A: Density plots of IS ratio against ARG-IS distance")


# Test the density estimates.
df_filt2 = subset(df, !(Mechanism == "antibiotic_target_alteration;antibiotic_target_replacement" | 
    Mechanism == "antibiotic_target_alteration;antibiotic_efflux" | 
    Mechanism == "antibiotic_efflux;reduced_permeability_to_antibiotic"))
df_filt2$Mechanism = gsub("antibiotic_efflux", "Antibiotic efflux", 
    df_filt2$Mechanism)
df_filt2$Mechanism = gsub("antibiotic_target_replacement", 
    "Antibiotic target replacement", df_filt2$Mechanism)
df_filt2$Mechanism = gsub("antibiotic_target_alteration", 
    "Antibiotic target alteration", df_filt2$Mechanism)
df_filt2$Mechanism = gsub("antibiotic_inactivation", 
    "Antibiotic inactivation", df_filt2$Mechanism)
df_filt2$Mechanism = gsub("reduced_permeability_to_antibiotic", 
    "Reduced permeability to antibiotic", df_filt2$Mechanism)
df_filt2$Mechanism = gsub("antibiotic_target_protection", 
    "Antibiotic target protection", df_filt2$Mechanism)

comp_means = compare_means(Mean_distance ~ Mechanism, 
    data = df_filt2, method = "wilcox.test", paired = F, 
    p.adjust.method = "fdr") %>% # mutate(y_pos = 2) %>%
filter(p.adj < 0.05)

means = df_filt2 %>% group_by(Mechanism) %>% drop_na(Mean_distance) %>% 
    summarise(mean_ARGMOB = mean(Mean_distance))
colnames(means) = c("group1", "mean1")

comp_means = comp_means %>% full_join(means, by = "group1") %>% 
    drop_na(p) %>% arrange(desc(mean1))

colnames(means) = c("group2", "mean2")
comp_means = comp_means %>% full_join(means, by = "group2") %>% 
    drop_na(p) %>% arrange(desc(mean1), desc(mean2))

comp_means$y_pos = seq(1, 0.9 + nrow(comp_means) * 
    800, 800)
comp_means$y_pos = comp_means$y_pos + 12500
for (i in 1:nrow(comp_means)) {
    if (comp_means$p.adj[i] < 1e-04) {
        comp_means$p.signif[i] = "****"
    } else if (comp_means$p.adj[i] < 0.001) {
        comp_means$p.signif[i] = "***"
    } else if (comp_means$p.adj[i] < 0.01) {
        comp_means$p.signif[i] = "**"
    } else if (comp_means$p.adj[i] < 0.05) {
        comp_means$p.signif[i] = "*"
    } else if (comp_means$p.adj[i] > 0.05) {
        comp_means$p.signif[i] = "ns"
    }
}



p_dist_test = ggplot(df_filt2, aes(Mechanism, Mean_distance)) + 
    geom_boxplot() + ggsignif::geom_signif(data = comp_means, 
    aes(xmin = group1, xmax = group2, annotations = p.signif, 
        y_position = y_pos), manual = T, color = "black", 
    tip_length = 0, vjust = 0.5) + theme_light() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(y = "ARG-IS distance (bases)") + scale_y_continuous(labels = seq(0, 
    12500, 2500), breaks = seq(0, 12500, 2500), limits = c(0, 
    15000)) + stat_summary(fun.y = mean, geom = "errorbar", 
    aes(ymax = ..y.., ymin = ..y..), width = 0.75, 
    size = 1, linetype = "solid", color = "blue") + 
    ggtitle("B: MWU test for\n ARG-IS distances")


# Produce supplemental figure S9
grid.arrange(p_dist_dens, p_dist_test, layout_matrix = cbind(c(1), 
    c(2)), widths = c(2, 0.7))

# Investigate distribution of families of IS
# elements Get percentage abundance of all IS
# families per major mechanism
tnp_table2 = dist_data2 %>% gather("key", "value", 
    IS_Up, IS_Down) %>% group_by(Mechanism, ARO, value) %>% 
    summarise(n = n()) %>% mutate(freq = n/sum(n) * 
    100)

tnp_table3 = data.frame(tnp_table2 %>% filter(Mechanism != 
    "antibiotic_efflux;reduced_permeability_to_antibiotic") %>% 
    filter(Mechanism != "antibiotic_target_alteration;antibiotic_efflux") %>% 
    filter(Mechanism != "antibiotic_target_alteration;antibiotic_target_replacement"))

foo <- data.frame(do.call("rbind", strsplit(as.character(tnp_table3$value), 
    ":", fixed = TRUE)))
colnames(foo) = c("IS_element", "IS_family")
tnp_table3 = cbind(tnp_table3, foo)

# Renme mechanisms
tnp_table3$Mechanism = gsub("antibiotic_efflux", "Antibiotic efflux", 
    tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("antibiotic_target_replacement", 
    "Antibiotic target replacement", tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("antibiotic_target_alteration", 
    "Antibiotic target alteration", tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("antibiotic_inactivation", 
    "Antibiotic inactivation", tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("reduced_permeability_to_antibiotic", 
    "Reduced permeability to antibiotic", tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("antibiotic_target_protection", 
    "Antibiotic target protection", tnp_table3$Mechanism)

# Plot only the top 12 IS families (most abundant)
top12_is = tnp_table3 %>% group_by(IS_family) %>% tally() %>% 
    arrange(desc(n)) %>% head(n = 18) %>% select(1)
top12_is = as.character(top12_is$IS_family)
tnp_table4 = tnp_table3[tnp_table3$IS_family %in% top12_is, 
    ]

# Do Wilcoxon Rank Sum tests (Mann-Whitney) on IS
# families per major mechanism
anno_df2 = compare_means(freq ~ Mechanism, group.by = "IS_family", 
    data = tnp_table4, method = "wilcox.test", paired = F, 
    p.adjust.method = "fdr") %>% filter(p.adj < 0.05)

# Convert p values to asterisks indicating
# significance levels
for (i in 1:nrow(anno_df2)) {
    if (anno_df2$p.adj[i] < 1e-04) {
        anno_df2$p.signif[i] = "****"
    } else if (anno_df2$p.adj[i] < 0.001) {
        anno_df2$p.signif[i] = "***"
    } else if (anno_df2$p.adj[i] < 0.01) {
        anno_df2$p.signif[i] = "**"
    } else if (anno_df2$p.adj[i] < 0.05) {
        anno_df2$p.signif[i] = "*"
    } else if (anno_df2$p.adj[i] > 0.05) {
        anno_df2$p.signif[i] = "ns"
    }
}

# Get a list of the tested IS families. Loop
# through them and make Y-coordinates for plotting
# significance bars.
temp_is = unique(anno_df2$IS_family)
anno_df5 = data.frame(IS_family = character(), .y. = character(), 
    group1 = character(), group2 = character(), p = character(), 
    p.adj = character(), p.format = character(), p.signif = character(), 
    method = character())
for (i in temp_is) {
    tempdf = anno_df2 %>% filter(IS_family == i)
    tempdf$ypos = head(seq(2.1, 5.1, 0.5), n = nrow(tempdf))
    anno_df5 = rbind(anno_df5, tempdf)
}


tnp_table4 = tnp_table3[tnp_table3$IS_family %in% temp_is, 
    ]
# Convert % abundance to log10(%)
tnp_table4$freq_log = log10(tnp_table4$freq)

# Produce supplemental figure S12
is_fam_plot = ggplot(tnp_table4, aes(Mechanism, freq_log)) + 
    geom_boxplot(outlier.shape = NA) + theme_light() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        panel.grid.minor.y = element_blank()) + facet_wrap(~IS_family, 
    ncol = 6) + ggsignif::geom_signif(data = anno_df5, 
    aes(xmin = group1, xmax = group2, annotations = p.signif, 
        y_position = ypos), manual = TRUE, tip_length = 0, 
    vjust = 0.5) + scale_y_continuous(breaks = c(0, 
    0.5, 1, 1.5, 2), limits = c(0, 4.8)) + labs(y = "Abundance of IS families in association\n with mechanisms (log10 of %)")
is_fam_plot
## Warning: Removed 3535 rows containing non-finite values (stat_boxplot).

Investigate IS and replicon ratios in different taxonomic groups

# Import data where taxonomic information has been
# included per ARG region
set.seed(1)
aro_tax = read.csv("aro_tax.Rdat", sep = "\t", header = F)

colnames(aro_tax) = c("Region", "ARO", "Up_dist", "Down_dist", 
    "IS_up", "IS_down", "Replicon", "Sim_cutoff", "Mean_dist", 
    "Cluster_size", "Mechanism", "Submechanism", "Acc.", 
    "Superkingdom", "Kingdom", "Phylum", "Class", "Order", 
    "Family", "Genus")

aro_tax$ARO = gsub("ARO:", "", aro_tax$ARO)
# If superkingdom is empty, remove the line aro_tax
# <- aro_tax[-which(aro_tax$Superkingdom == ''), ]


# Try to group by both phylum and ARO and get mean
# distances
by_aro_tax <- aro_tax %>% group_by(Phylum, ARO)

# Make a summarized data frame.
df_aro_tax = by_aro_tax %>% summarise(mean_dist = mean(Mean_dist), 
    zeroes = sum(Mean_dist == 0), ratio = sum(Mean_dist > 
        0)/(sum(Mean_dist == 0) + sum(Mean_dist > 0)), 
    occurrences = n(), replicon_type = sum(Replicon == 
        "Plasmid")/(sum(Replicon == "Plasmid") + sum(Replicon == 
        "Chr")))

# make plot of phylums
p_phylum = ggplot(df_aro_tax, aes(x = reorder(Phylum, 
    -ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences, 
    color = replicon_type), alpha = 0.6) + geom_boxplot(alpha = 0.2, 
    fatten = NULL) + stat_summary(fun.y = mean, geom = "errorbar", 
    aes(ymax = ..y.., ymin = ..y..), width = 0.75, 
    size = 1, linetype = "solid") + scale_size_continuous(range = c(1, 
    3), guide = F) + theme_light() + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + labs(y = "IS ratio", color = "Replicon ratio", 
    size = "# CRLs", x = "Phylum") + ggtitle("A: Phyla") + 
    scale_color_continuous(high = "red", low = "blue", 
        guide = F)

# For performing statistical tests, remove phyla
# with less than 10 entries
df_aro_tax_filt = df_aro_tax %>% group_by(Phylum) %>% 
    filter(sum(occurrences) > 10)

# Is there normality in the data?
df_aro_tax_filt %>% group_by(Phylum) %>% filter(n() > 
    3) %>% shapiro_test(ratio)
## # A tibble: 5 x 4
##   Phylum         variable statistic        p
##   <fct>          <chr>        <dbl>    <dbl>
## 1 Actinobacteria ratio        0.631 2.64e-15
## 2 Bacteroidetes  ratio        0.665 4.30e- 9
## 3 Firmicutes     ratio        0.789 7.36e-17
## 4 Proteobacteria ratio        0.748 3.90e-34
## 5 Tenericutes    ratio        0.765 5.28e- 2
# There is not normality in the data

# Perform Wilcoxon Rank Sum test (Mann-Whitney)
phylum_test = as.data.frame(compare_means(ratio ~ Phylum, 
    data = df_aro_tax_filt, method = "wilcox.test", 
    paired = F, p.adjust.method = "none")) %>% filter(p.adj < 
    0.05)
comparisons_phylum = as.data.frame(cbind(phylum_test$group1, 
    phylum_test$group2))

phylum_test
##     .y.         group1         group2            p   p.adj p.format p.signif
## 1 ratio Actinobacteria     Firmicutes 1.070055e-04 1.1e-04  0.00011      ***
## 2 ratio Actinobacteria Proteobacteria 6.656067e-08 6.7e-08  6.7e-08     ****
## 3 ratio  Bacteroidetes     Firmicutes 3.991445e-03 4.0e-03  0.00399       **
## 4 ratio  Bacteroidetes Proteobacteria 1.703118e-04 1.7e-04  0.00017      ***
## 5 ratio     Firmicutes Proteobacteria 1.557417e-02 1.6e-02  0.01557        *
##     method
## 1 Wilcoxon
## 2 Wilcoxon
## 3 Wilcoxon
## 4 Wilcoxon
## 5 Wilcoxon
##### Subset bacteria

# Start with plotting firmicutes families
sub_bact <- by_aro_tax[which(by_aro_tax$Phylum == "Firmicutes"), 
    ]
sub_bact <- sub_bact %>% group_by(Family, ARO)
sub_df_aro_tax = sub_bact %>% summarise(mean_dist = mean(Mean_dist), 
    zeroes = sum(Mean_dist == 0), ratio = sum(Mean_dist > 
        0)/(sum(Mean_dist == 0) + sum(Mean_dist > 0)), 
    occurrences = n(), replicon_type = sum(Replicon == 
        "Plasmid")/(sum(Replicon == "Plasmid") + sum(Replicon == 
        "Chr")))

p_firmi = ggplot(sub_df_aro_tax, aes(x = reorder(Family, 
    -ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences, 
    color = replicon_type), alpha = 0.6) + geom_boxplot(alpha = 0.2) + 
    stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., 
        ymin = ..y..), width = 0.75, size = 1, linetype = "solid") + 
    scale_size_continuous(range = c(1, 3)) + theme_light() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(y = "IS ratio", color = "Replicon ratio", 
        size = "# CRLs", x = "Family") + ggtitle("Firmicutes families") + 
    scale_color_continuous(high = "red", low = "blue", 
        guide = F)

p_firmi

# Plot proteobacterial phylum
sub_bact <- by_aro_tax[which(by_aro_tax$Phylum == "Proteobacteria"), 
    ]
# Other phyla can be selected with e.g.: sub_bact
# <-
# by_aro_tax[which(by_aro_tax$Phylum=='Actinobacteria'),
# ] sub_bact <-
# by_aro_tax[which(by_aro_tax$Phylum=='Bacteroidetes'),
# ]

# What percentage of total unclustered loci does
# proteobacteria constitute?
nrow(sub_bact)/nrow(by_aro_tax) * 100
## [1] 88.17861
sub_bact <- sub_bact %>% group_by(Family, ARO)

sub_df_aro_tax = sub_bact %>% summarise(mean_dist = mean(Mean_dist), 
    zeroes = sum(Mean_dist == 0), ratio = sum(Mean_dist > 
        0)/(sum(Mean_dist == 0) + sum(Mean_dist > 0)), 
    occurrences = n(), replicon_type = sum(Replicon == 
        "Plasmid")/(sum(Replicon == "Plasmid") + sum(Replicon == 
        "Chr")))



# Remmove ca. 400 low-abundance proteobacteria
# families out of 1605 total for testing
sub_df_aro_tax_filt = sub_df_aro_tax %>% group_by(Family) %>% 
    filter(sum(occurrences) > 500)

# Test the IS ratio
family_test = as.data.frame(compare_means(ratio ~ Family, 
    data = sub_df_aro_tax_filt, method = "wilcox.test", 
    paired = F, p.adjust.method = "fdr")) %>% filter(p.adj < 
    0.05)
datatable(family_test)
# Test the replicon ratio
family_test_p = as.data.frame(compare_means(replicon_type ~ 
    Family, data = sub_df_aro_tax_filt, method = "wilcox.test", 
    paired = F, p.adjust.method = "fdr")) %>% filter(p.adj < 
    0.05)
datatable(family_test_p)
p_proteo = ggplot(sub_df_aro_tax_filt, aes(x = reorder(Family, 
    -ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences, 
    color = replicon_type), alpha = 0.6) + geom_boxplot(alpha = 0.2, 
    fatten = NULL) + stat_summary(fun.y = mean, geom = "errorbar", 
    aes(ymax = ..y.., ymin = ..y..), width = 0.75, 
    size = 1, linetype = "solid") + scale_size_continuous(range = c(1, 
    3)) + theme_light() + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + labs(y = "IS ratio", color = "Replicon ratio", 
    size = "# CRLs", x = "Family") + ggtitle("B: Top 10 Proteobacteria families") + 
    scale_color_continuous(high = "red", low = "blue", 
        guide = F)

# p_proteo

replicon_proteo = ggplot(sub_df_aro_tax_filt, aes(x = reorder(Family, 
    -ratio, FUN = mean), replicon_type)) + geom_jitter(aes(size = occurrences, 
    color = ratio), alpha = 0.6) + geom_boxplot(alpha = 0.2, 
    fatten = NULL) + stat_summary(fun.y = mean, geom = "errorbar", 
    aes(ymax = ..y.., ymin = ..y..), width = 0.75, 
    size = 1, linetype = "solid") + scale_size_continuous(range = c(0.5, 
    6)) + theme_light() + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + labs(y = "Replicon ratio", color = "IS ratio", 
    size = "# CRLs", x = "Family") + ggtitle("Replicon ratio of top 10 Proteobacteria families") + 
    scale_color_continuous(high = "red", low = "blue")

replicon_proteo

# Enterobacteriaceae subset
sub_bact <- by_aro_tax[which(by_aro_tax$Family == "Enterobacteriaceae"), 
    ]
# Other subsets selected with: sub_bact <-
# by_aro_tax[which(by_aro_tax$Family=='Burkholderiaceae'),
# ] sub_bact <-
# by_aro_tax[which(by_aro_tax$Family=='Bacillaceae'),
# ] sub_bact <-
# by_aro_tax[which(by_aro_tax$Family=='Enterococcaceae'),
# ] sub_bact <-
# by_aro_tax[which(by_aro_tax$Family=='Staphylococcaceae'),
# ]

sub_bact <- sub_bact %>% group_by(Genus, ARO)

sub_df_aro_tax = sub_bact %>% summarise(mean_dist = mean(Mean_dist), 
    zeroes = sum(Mean_dist == 0), ratio = sum(Mean_dist > 
        0)/(sum(Mean_dist == 0) + sum(Mean_dist > 0)), 
    occurrences = n(), replicon_type = sum(Replicon == 
        "Plasmid")/(sum(Replicon == "Plasmid") + sum(Replicon == 
        "Chr")))

sub_df_aro_tax_filt2 = sub_df_aro_tax %>% group_by(Genus) %>% 
    filter(sum(occurrences) > 100)

genus_test_p = as.data.frame(compare_means(replicon_type ~ 
    Genus, data = sub_df_aro_tax_filt2, method = "wilcox.test", 
    paired = F, p.adjust.method = "fdr")) %>% filter(p.adj < 
    0.05) %>% select(2:3, 5, 7)

genus_test_p$Ratio = "Replicon"


for (i in 1:nrow(genus_test_p)) {
    if (genus_test_p$p.adj[i] < 1e-04) {
        genus_test_p$p.signif[i] = "****"
    } else if (genus_test_p$p.adj[i] < 0.001) {
        genus_test_p$p.signif[i] = "***"
    } else if (genus_test_p$p.adj[i] < 0.01) {
        genus_test_p$p.signif[i] = "**"
    } else if (genus_test_p$p.adj[i] < 0.05) {
        genus_test_p$p.signif[i] = "*"
    } else if (genus_test_p$p.adj[i] > 0.05) {
        genus_test_p$p.signif[i] = "ns"
    }
}


genus_test = as.data.frame(compare_means(ratio ~ Genus, 
    data = sub_df_aro_tax_filt2, method = "wilcox.test", 
    paired = F, p.adjust.method = "fdr")) %>% filter(p.adj < 
    0.05) %>% select(2:3, 5, 7)

genus_test$Ratio = "IS"



for (i in 1:nrow(genus_test)) {
    if (genus_test$p.adj[i] < 1e-04) {
        genus_test$p.signif[i] = "****"
    } else if (genus_test$p.adj[i] < 0.001) {
        genus_test$p.signif[i] = "***"
    } else if (genus_test$p.adj[i] < 0.01) {
        genus_test$p.signif[i] = "**"
    } else if (genus_test$p.adj[i] < 0.05) {
        genus_test$p.signif[i] = "*"
    } else if (genus_test$p.adj[i] > 0.05) {
        genus_test$p.signif[i] = "ns"
    }
}


# Print a table of the pairwise comparisons for
# Enterobacteriaceae genera for IS and replicon
# ratios
genus_test_both = full_join(genus_test, genus_test_p, 
    by = c("group1", "group2")) %>% select(1:4, 6:7)
colnames(genus_test_both) = c("Genus 1", "Genus 2", 
    "IS p", "IS sign.", "Replicon p", "Replicon sign.")

print("Mann-Whitney pairwise tests of IS and replicon ratios for Enterobacteriaceae genera")
## [1] "Mann-Whitney pairwise tests of IS and replicon ratios for Enterobacteriaceae genera"
# Print table with kable package
# kable(genus_test_both,longtable = T) %>%
# kable_styling(bootstrap_options = c('striped',
# 'hover', 'condensed'), latex_options =
# c('hold_position', 'repeat_header')) Print table
# with DT package
datatable(genus_test_both)
write.csv2(genus_test_both, "genus_test_both.csv")
p_genera1 = ggplot(sub_df_aro_tax, aes(x = reorder(Genus, 
    -ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences, 
    color = replicon_type), alpha = 0.6) + geom_boxplot(alpha = 0.2, 
    fatten = NULL) + stat_summary(fun.y = mean, geom = "errorbar", 
    aes(ymax = ..y.., ymin = ..y..), width = 0.75, 
    size = 1, linetype = "solid") + scale_size_continuous(range = c(1, 
    3)) + theme_light() + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + labs(y = "IS ratio", color = "Replicon ratio", 
    size = "# CRLs", x = "Genus") + ggtitle(expression("C: Top 15 " ~ 
    italic(Enterobacteriaceae) ~ " genera")) + scale_color_continuous(high = "red", 
    low = "blue", guide = F)

# p_genera1


p_genera2 = ggplot(sub_df_aro_tax_filt2, aes(x = reorder(Genus, 
    -ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences, 
    color = replicon_type), alpha = 0.6) + scale_size_continuous(guide = F) + 
    scale_color_continuous(high = "red", low = "blue") + 
    labs(color = "Replicon ratio") + theme(legend.position = "bottom")

legend <- cowplot::get_legend(p_genera2)
grid.arrange(p_phylum, p_proteo, p_genera1, legend, 
    layout_matrix = rbind(c(1, 2), c(3), c(4)), heights = c(6, 
        6, 1))

Collect different MOB parameters and derive the ARG-MOB scale from these

#What we need and from where
#IS_ratio         Main analyses
#Replicon_ratio   Main analyses
#Int_ratio        Integron finder. Get the total number of occurrences from dist_data and then number of integron occurrences from df_int
#Phylo spread     Main analyses/taxonomy data

#A new data frame is required with all this data. Rows are individual CRLs and values
#Import IS distance and replicon data again. Same data as above, but reimported to ensure that formatting is for analyses
dist_data=read.csv("ARG_IS_distances_mech.txt",sep = "\t",header=F)
colnames(dist_data) = c("Region","ARO","Updist","Downdist","IS_Up","IS_Down","Replicon",
                        "Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism")
#dist_data_pass_sim = dist_data[which(dist_data$Mean_dist < dist_data$Sim_cutoff),]
#dist_data = dist_data_pass_sim
dist_data2 = dist_data
dist_data2$ARO = gsub("ARO:","",dist_data2$ARO)
all_aros = unique(as.vector(dist_data2$ARO))
df <- data.frame(ARO=character(),
                 Occurrences=numeric(), 
                 Ratio=numeric(), 
                 Mean_distance=numeric(),
                 SD_distance=numeric(),
                 Mechanism=character(),
                 Submechanism=character(),
                 Zero_IS=numeric(),
                 Replicon=character())

for (ARO in all_aros){
  temp <- dist_data2[which(dist_data2$ARO==ARO), ]
  Zero_IS = nrow(temp[which(temp$Mean_dist == 0),])
  number_nonzero = nrow(temp[which(temp$Mean_dist > 0),])
  Ratio = number_nonzero/nrow(temp)
  Occurrences = nrow(temp)
  temp_nonzero = temp[which(temp$Mean_dist > 0),]
  Mean_distance = mean(temp_nonzero$Mean_dist)
  SD_distance = sd(temp_nonzero$Mean_dist)
  Mechanism = unique(as.vector(temp$Mechanism))
  IS_Up = unique(as.vector(temp$IS_Up))
  IS_Down = unique(as.vector(temp$IS_Down))
  Submechanism = unique(as.vector(temp$Submechanism))
  p_rep = nrow(temp[which(temp$Replicon == "Plasmid"),])
  c_rep = nrow(temp[which(temp$Mean_dist == "Chr"),])
  Replicon = p_rep/nrow(temp)
  temp2 = as.data.frame(cbind(ARO,Occurrences,Ratio,Mean_distance,SD_distance,Mechanism,Submechanism,Zero_IS,Replicon))
  df = rbind(df,temp2)
}



df$Occurrences = as.numeric(as.character(df$Occurrences))
df$Mean_distance = as.numeric(as.character(df$Mean_distance))
df$SD_distance = as.numeric(as.character(df$SD_distance))
df$Ratio = as.numeric(as.character(df$Ratio))
df$Zero_IS = as.numeric(as.character(df$Zero_IS))
df$Replicon = as.numeric(as.character(df$Replicon))


#Import data on integrons
arg_int = read.csv("ARG_integrons_ISProx.txt", header = F, sep = "\t")
colnames(arg_int) = c("Region","ARO","Updist","Downdist","IS_up","IS_down",
                      "Replicon","Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism")

arg_int$ARO = gsub("ARO:","",arg_int$ARO)
arg_int$Cluster_size = gsub("size_","",arg_int$Cluster_size)

int_all_aros = unique(as.vector(arg_int$ARO))
df_int <- data.frame(ARO=character(),
                     Occurrences=numeric(), 
                     Ratio=numeric(), 
                     Mean_distance=numeric(),
                     SD_distance=numeric(),
                     Mechanism=character(),
                     Submechanism=character(),
                     Zero_IS=numeric(),
                     Replicon=character(),
                     Total_occurrences=numeric())
#Summarise integron data per ARO
for (ARO in int_all_aros){
  temp <- arg_int[which(arg_int$ARO==ARO), ]
  Zero_IS = nrow(temp[which(temp$Mean_dist == 0),])
  number_nonzero = nrow(temp[which(temp$Mean_dist > 0),])
  Ratio = number_nonzero/nrow(temp)
  Occurrences = nrow(temp)
  temp_nonzero = temp[which(temp$Mean_dist > 0),]
  Mean_distance = mean(temp_nonzero$Mean_dist)
  SD_distance = sd(temp_nonzero$Mean_dist)
  Mechanism = unique(as.vector(temp$Mechanism))
  Submechanism = unique(as.vector(temp$Submechanism))
  p_rep = nrow(temp[which(temp$Replicon == "Plasmid"),])
  c_rep = nrow(temp[which(temp$Mean_dist == "Chr"),])
  Replicon = p_rep/nrow(temp)
  Total_occurrences = sum(as.numeric(as.character(temp$Cluster_size)))
  temp2 = as.data.frame(cbind(ARO,Occurrences,Ratio,Mean_distance,SD_distance,Mechanism,Submechanism,Zero_IS,Replicon,Total_occurrences))
  df_int = rbind(df_int,temp2)
  
}



df_int$Occurrences = as.numeric(as.character(df_int$Occurrences))
df_int$Mean_distance = as.numeric(as.character(df_int$Mean_distance))
df_int$SD_distance = as.numeric(as.character(df_int$SD_distance))
df_int$Ratio = as.numeric(as.character(df_int$Ratio))
df_int$Zero_IS = as.numeric(as.character(df_int$Zero_IS))
df_int$Replicon = as.numeric(as.character(df_int$Replicon))


df_int$Mechanism = gsub("antibiotic_efflux","Antibiotic efflux",df_int$Mechanism)
df_int$Mechanism = gsub("antibiotic_target_replacement","Antibiotic target replacement",df_int$Mechanism)
df_int$Mechanism = gsub("antibiotic_target_alteration","Antibiotic target alteration",df_int$Mechanism)
df_int$Mechanism = gsub("antibiotic_inactivation","Antibiotic inactivation",df_int$Mechanism)
df_int$Mechanism = gsub("reduced_permeability_to_antibiotic","Reduced permeability to antibiotic",df_int$Mechanism)
df_int$Mechanism = gsub("antibiotic_target_protection","Antibiotic target protection",df_int$Mechanism)
df_int$Submechanism = gsub("_"," ",df_int$Submechanism)


#Plot integron data
df_int2 = df_int %>%
  select(Mechanism,Submechanism,Occurrences,Total_occurrences) %>%
  melt() %>%
  group_by(Mechanism,Submechanism,variable) %>%
  summarise(Accum = sum(value))

#Make nicer names
df_int2$variable = gsub("Occurrences","CRL occurrences",df_int2$variable)
df_int2$variable = gsub("Total_occurrences","Total occurrences",df_int2$variable)

df_int2$Submechanism = gsub("beta-lactamase","b-l",df_int2$Submechanism)
df_int2$Submechanism = gsub("beta-lactams","b-lactams",df_int2$Submechanism)
df_int2$Submechanism = gsub("antibiotic","Ab.",df_int2$Submechanism)
df_int2$Submechanism = gsub("Antibiotic","Ab.",df_int2$Submechanism)
df_int2$Submechanism = gsub("chloramphenicol acetyltransferase","",df_int2$Submechanism)
df_int2$Submechanism = gsub("rifampin ADP-ribosyltransferase","",df_int2$Submechanism)
df_int2$Submechanism = gsub("transferase","trans.",df_int2$Submechanism)
df_int2$Submechanism = gsub("trimethoprim","trim.",df_int2$Submechanism)
df_int2$Submechanism = gsub("reductase","red.",df_int2$Submechanism)
df_int2$Submechanism = gsub("major facilitator superfamily \\(","",df_int2$Submechanism)
df_int2$Submechanism = gsub("small multidrug resistance \\(","",df_int2$Submechanism)
df_int2$Submechanism = gsub("resistance-nodulation-cell division \\(","",df_int2$Submechanism)
df_int2$Submechanism = gsub("ATP-binding cassette \\(","",df_int2$Submechanism)
df_int2$Submechanism = gsub("\\) ab. efflux pump","",df_int2$Submechanism)
df_int2$Submechanism = gsub("resistant","res.",df_int2$Submechanism)
df_int2$Submechanism = gsub("cassette ribosomal protection","cas. ribo. prot.",df_int2$Submechanism)
df_int2$Submechanism = gsub("protein","",df_int2$Submechanism)
df_int2$Submechanism = gsub("rifampicin","rif.",df_int2$Submechanism)
df_int2$Submechanism = gsub("permeability","perm.",df_int2$Submechanism)
df_int2$Submechanism = gsub("General Bacterial ","",df_int2$Submechanism)
df_int2$Mechanism = gsub("Antibiotic","Ab.",df_int2$Mechanism)
df_int2$Mechanism = gsub("antibiotic","Ab.",df_int2$Mechanism)
df_int2$Mechanism = gsub("permeability","perm.",df_int2$Mechanism)


df_int3 = df_int2 %>%
  group_by(Mechanism, variable) %>%
  select(Mechanism,variable,Accum) %>%
  summarise(
    Accum2 = sum(Accum)
  )

df_int3 = df_int3 %>%
  filter(variable == "CRL occurrences")

df_int3$Mechanism = gsub("_"," ",df_int3$Mechanism)

p_int = ggplot(df_int3, aes(x=reorder(Mechanism,desc(Accum2), FUN = sum), y = Accum2)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_light() +
  theme(axis.text.x = element_text(angle = 45, hjust =1,size=10)) +
  labs(x = "", fill = "", y = "Count") +
  ggtitle("A: Major mechanisms in integrons") 


df_int2 = df_int2 %>%
  filter(variable == "CRL occurrences") %>%
  filter(Accum > 10)

colours = c("Ab. efflux" = "#A6CEE3", "Ab. inactivation" = "#1F78B4", "Ab. target replacement" = "#FB9A99", "Ab. target protection" = "#33A02C")

p_int_sub = ggplot(df_int2, aes(x=reorder(Submechanism,desc(Accum), FUN = sum),Accum, color = reorder(Mechanism,desc(Accum), FUN = sum))) +
  geom_point(size = 3) +
  theme_light() +
  theme(axis.text.x = element_text(angle = 45, hjust =1, size = 10)) +
  labs(x = "", fill = "", y = "Count", color = "Mechanism") +
  ggtitle("B: Submechanisms in integrons (count > 10)") +
  scale_color_manual(values = colours,
                     labels = c("Ab. inactivation (AI)","Ab. target replacement (ATR)",
                                "Ab. efflux (AE)","Ab. target protection (ATP)"))


grid.arrange(p_int,p_int_sub, widths = c(1,2),ncol =2)

#Get a table of relative abundance of the top submechanisms in integrons
#df_int2 = df_int %>%
#  select(Mechanism,Submechanism,Occurrences,Total_occurrences) %>%
#  melt() %>%
#  group_by(Mechanism,Submechanism,variable) %>%
#  summarise(
#    Accum = sum(value))

#df_int4 = df_int2 %>% 
#  filter(variable == "Occurrences")

#df_int4$SubFreq = df_int4$Accum/sum(df_int4$Accum) *100

#df_int4_top = df_int4 %>%
#  group_by(Submechanism) %>%
#  arrange(desc(SubFreq)) %>%
#  select(Mechanism,Submechanism,Accum,SubFreq) %>%
#  `colnames<-`(c("Mechanism", "Submechanism", "CRL occurrences","Percentage of occurrences"))

#Get the submech counts from main analyses for comparison
df_int_join = df_int %>% left_join(df, by = "ARO") %>%
  select(ARO,Mechanism.x,Submechanism.x,Occurrences.x,Occurrences.y) %>%
  arrange(desc(Occurrences.x)) %>%
  `colnames<-`(c("ARO","Mechanism", "Submechanism", "Integron occurrences","Total occurrences"))
  
Gene = c("aadA","aadA2","AAC(6')-Ib-cr","dfrA12","OXA-1",
                  "dfrA14","aadA5","AAC(6')-Ib10","arr-3","sul1","dfrA1","qacH",
                  "catB3","cmlA1","AAC(6')-Ib9","ANT(3'')-IIa","OXA-9",
                  "ANT(2'')-Ia","AAC(6')-Ib7","cmlA5")
df_int_join = cbind(Gene,head(df_int_join,20))
df_int_join$'Percentage of integron occurrences' = df_int_join$`Integron occurrences`/sum(df_int_join$`Integron occurrences`) *100
df_int_join$'Percentage of total occurrences' = df_int_join$`Integron occurrences`/df_int_join$`Total occurrences` *100

datatable(df_int_join,)
#Table with kable
#kable(head(df_int_join,20)) %>%   kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% column_spec(1,italic = TRUE)
#kable(head(df_int_join,20)) %>%   kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% column_spec(1,italic = TRUE) %>%
#  as_image(file = "integron_top20_submechs.png")
#Export csv file of integron results (top20)
#write.csv2(df_int_join,file = "integron_top20_submechs.csv",sep = "\t",dec = ".")




#Merge ratios with MOB parameters for creating the ARG-MOB scale
all_aros = unique(as.vector(df$ARO))
dist_int_join = full_join(df, df_int, by = "ARO") %>%
  select(1,2,3,6,7,9,10,11,17)
colnames(dist_int_join) = c("ARO","Total_occurrences","IS_ratio",
                            "Mechanism","Submechanism","Replicon_ratio",
                            "Int_occurrences","Int_IS_ratio","Int_replicon_ratio")

#Merge integron data with IS distance and replicon data
dist_int_join$Int_ratio = dist_int_join$Int_occurrences/dist_int_join$Total_occurrences

#Import taxonomy data
aro_tax = read.csv("aro_tax.Rdat",sep="\t",header =F)

colnames(aro_tax) = c("Region","ARO","Up_dist","Down_dist","IS_up","IS_down",
                      "Replicon","Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism","Acc.","Superkingdom",
                      "Kingdom","Phylum","Class","Order","Family","Genus")

aro_tax$ARO = gsub("ARO:","",aro_tax$ARO)
#If superkingdom is empty, remove the line
#aro_tax <- aro_tax[-which(aro_tax$Superkingdom == ""), ]


#Other tax parameters
df_aro_tax = as.data.frame(aro_tax %>% 
  group_by(ARO) %>%
  summarise(
    unique_genera = n_distinct(Family),
    n = n()))

df_aro_tax$Phylo_spread = 1-(1/df_aro_tax$unique_genera)


tax_all_aros = unique(as.vector(aro_tax$ARO))
df_tax <- data.frame(ARO=character(),
                     Diversity=numeric())

#If the below loop fails, it helps to restart R for some reason.
#Calculate Simpson diversity index per ARO
for (ARO in tax_all_aros){
  temp <- aro_tax[which(aro_tax$ARO==ARO), ]
  tax_count = as.data.frame(temp %>% group_by(Genus) %>% summarize(count=n()))
  rownames(tax_count) = tax_count$Genus
  unique_genera = n_distinct(tax_count$Genus)
  tax_count = subset(tax_count, select = count)
  taxcount = nrow(temp)
  Simpson_index = as.numeric(diversity(tax_count, index = "simpson"))
  Simpson2 = 1-(1/as.numeric(as.character(Simpson_index)))
  temp2 = as.data.frame(cbind(ARO,unique_genera,Simpson_index,taxcount,Simpson2))
  df_tax = rbind(df_tax,temp2)
}

df_tax$unique_genera = as.numeric(as.character(df_tax$unique_genera))
df_tax$Simpson_index = as.numeric(as.character(df_tax$Simpson_index))
df_tax$taxcount = as.numeric(as.character(df_tax$taxcount))
df_tax$Simpson2 = as.numeric(as.character(df_tax$Simpson2))

df_tax = unique(df_tax)

#Merge with other data
dist_int_tax_join = full_join(dist_int_join, df_tax, by = "ARO") %>%
  select(1,2,3,4,5,6,7,8,9,10,11,12,14)
dist_int_tax_join[is.na(dist_int_tax_join)] <- 0

dist_int_tax_join$ARGMOB = (dist_int_tax_join$IS_ratio + 
                                   dist_int_tax_join$Replicon_ratio + 
                                   dist_int_tax_join$Int_ratio + 
                                   dist_int_tax_join$Simpson_index)/4


#Rename mechanisms
dist_int_tax_join$Mechanism = gsub("antibiotic_efflux","Antibiotic efflux",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("antibiotic_target_replacement","Antibiotic target replacement",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("antibiotic_target_alteration","Antibiotic target alteration",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("antibiotic_inactivation","Antibiotic inactivation",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("reduced_permeability_to_antibiotic","Reduced permeability to antibiotic",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("antibiotic_target_protection","Antibiotic target protection",dist_int_tax_join$Mechanism)


dist_int_tax_join_filt = dist_int_tax_join[which(dist_int_tax_join$Total_occurrences>10),]
dist_int_tax_join_filt = dist_int_tax_join %>%
  filter(Total_occurrences> 0) %>%
  filter(Mechanism != "Antibiotic efflux;Reduced permeability to antibiotic") %>%
  filter(Mechanism != "Antibiotic target alteration;Antibiotic efflux") %>%
  filter(Mechanism != "Reduced permeability to antibiotic") %>%
  filter(Mechanism != "Antibiotic target alteration;Antibiotic target replacement")



#Test for normality per mechanism and equal variance
qqp = ggqqplot(dist_int_tax_join_filt, x = "ARGMOB",color = "Mechanism")
ggpar(qqp, legend = "right")

dist_int_tax_join_filt %>% 
  select(Mechanism, ARGMOB) %>% # Remove unnecessary columns
  gather(key = "Mechanism", value = "ARGMOB") %>%
  group_by(Mechanism)  %>% 
  filter(n() > 3L) %>%
  do(broom::tidy(shapiro.test(.$ARGMOB)))  %>% 
  ungroup()  %>% 
  select(Mechanism, W = statistic, 'p-value' = p.value)
## # A tibble: 5 x 3
##   Mechanism                         W `p-value`
##   <chr>                         <dbl>     <dbl>
## 1 Antibiotic efflux             0.788  9.54e-17
## 2 Antibiotic inactivation       0.831  7.36e-27
## 3 Antibiotic target alteration  0.862  1.50e- 9
## 4 Antibiotic target protection  0.913  5.22e- 4
## 5 Antibiotic target replacement 0.950  8.38e- 2
#Fails. Data is not normally distributed 

#Test for homogeneity of variance #Fails. We cannot do T-test but instead Mann-Whitney test.
bartlett.test(ARGMOB ~ Mechanism, data = dist_int_tax_join_filt)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  ARGMOB by Mechanism
## Bartlett's K-squared = 63.416, df = 4, p-value = 5.547e-13
#Make a function to test the different ratios for plotting
mw_custom_fun = function() {
  mech_means <- dist_int_tax_join_filt %>%
  group_by(Mechanism) %>%
  summarise(mean_ARGMOB = mean(ARGMOB))
  
  colnames(mech_means) = c("group1","mech_mean_ARGMOB")

  comp_means2 <- comp_means %>%
  full_join(mech_means, by = "group1") %>%
  drop_na(p) %>%
  arrange(desc(mech_mean_ARGMOB))

  colnames(mech_means) = c("group2","mech_mean_ARGMOB")
  
  comp_means2 <- comp_means2 %>%
  full_join(mech_means, by = "group2") %>%
  drop_na(p) %>%
  arrange(desc(mech_mean_ARGMOB.x),desc(mech_mean_ARGMOB.y))

  y_pos = seq(1.1, 1+nrow(comp_means2)*0.16, 0.16)
  comp_means2 = cbind(comp_means2,y_pos)
  
  comp_means2 = comp_means2 %>%
    mutate(p.signif = case_when(
      p.adj < 0.0001 ~ "****",
      p.adj < 0.001 ~ "***",
      p.adj < 0.01 ~ "**",
      p.adj < 0.05 ~ "*",
      p.adj > 0.05 ~ "ns"
    ))
}

comp_means = compare_means(ARGMOB ~ Mechanism, data = dist_int_tax_join_filt,
                           method = "wilcox.test",paired = F,
                           p.adjust.method = "fdr") %>%
    filter(p.adj < 0.05)

comp_means = mw_custom_fun()

datatable(comp_means)
#kable(comp_means) %>%
#  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

comp_means = compare_means(IS_ratio ~ Mechanism, data = dist_int_tax_join_filt,
                           method = "wilcox.test",paired = F,
                           p.adjust.method = "fdr") %>%
    filter(p.adj < 0.05)

comp_means = mw_custom_fun()

p1 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) + 
  geom_boxplot(outlier.shape =NA) +
  theme_light() +
  theme(axis.text.x=element_blank(),
        legend.position = "none",
        axis.title.x=element_blank()) +
  geom_point(aes(color = IS_ratio, size = Total_occurrences),
             position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
  scale_color_continuous(high = "red",low = "blue") +
  labs(title = "IS ratio", y = "ARG-MOB") +
  ggtitle("IS ratio") +
  scale_size_continuous(range = c(0.5,3)) +
  stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
               width = 0.75, size = 1, linetype = "dashed") +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00), 
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1))  +
  annotate("text", x = 1:5, y = 1, label = c("bold(a)","bold(b)","bold(bc)","bold(c)","bold(c)"), 
           parse = T, size = 3)

#p1

p1_dens = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), IS_ratio)) +
  geom_boxplot(outlier.alpha = 0.5,) +
  theme_light() +
  ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2, 
                                             annotations=p.signif, y_position=y_pos), manual=TRUE, tip_length = 0,vjust=0.5) +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00), 
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,max(comp_means$y_pos+0.2))) +
  theme(axis.text.x=element_blank(),
        legend.position = "none",
        axis.title.x=element_blank()) +
  labs(y = "IS ratio")
  
  
#Replicon ratio
comp_means = compare_means(Replicon_ratio ~ Mechanism, data = dist_int_tax_join_filt, 
                           method = "wilcox.test",paired = F,
                           p.adjust.method = "fdr") %>%
  filter(p.adj < 0.05)

comp_means = mw_custom_fun()

p2 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) + 
  geom_boxplot(outlier.shape =NA) +
  theme_light() +
  theme(axis.text.x=element_blank(),
        legend.position = "none",
        axis.title.x=element_blank()) +
  geom_point(aes(color = Replicon_ratio, size = Total_occurrences),
             position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
  scale_color_continuous(high = "red",low = "blue") +
  labs(title = "Replicon ratio", y = "ARG-MOB") +
  scale_size_continuous(range = c(0.5,3)) +
  stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
               width = 0.75, size = 1, linetype = "dashed") +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00), 
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1)) +
  annotate("text", x = 1:5, y = 1, label = c("bold(a)","bold(b)","bold(bc)","bold(c)","bold(c)"), 
           parse = T, size = 3)


p2_dens = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), Replicon_ratio)) +
  geom_boxplot(outlier.alpha = 0.5,) +
  theme_light() +
  ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2, 
                                             annotations=p.signif, y_position=y_pos), manual=TRUE, tip_length = 0,vjust=0.5) +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00), 
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,max(comp_means$y_pos+0.2))) +
  theme(axis.text.x=element_blank(),
        legend.position = "none",
        axis.title.x=element_blank()) +
  labs(y = "Replicon ratio")


#Integron ratio
comp_means = compare_means(Int_ratio ~ Mechanism, data = dist_int_tax_join_filt, 
                           method = "wilcox.test",paired = F,
                           p.adjust.method = "fdr") %>%
  filter(p.adj < 0.05)

comp_means = mw_custom_fun()

p3 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) + 
  geom_boxplot(outlier.shape =NA) +
  theme_light() +
  theme(#axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none",
        axis.text.x=element_blank(),
        axis.title.x=element_blank()) +
  geom_point(aes(color = Int_ratio, size = Total_occurrences),
             position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
  scale_color_continuous(high = "red",low = "blue") +
  labs(title = "Integron ratio", y = "ARG-MOB")+
  scale_size_continuous(range = c(0.5,3))  +
  stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
               width = 0.75, size = 1, linetype = "dashed") +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00), 
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1)) +
  annotate("text", x = 1:5, y = 1, label = c("bold(a)","bold(b)","bold(bc)","bold(c)","bold(c)"), 
           parse = T, size = 3)

p3_dens = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), Int_ratio)) +
  geom_boxplot(outlier.alpha = 0.5,) +
  theme_light() +
  ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2, 
                                             annotations=p.signif, y_position=y_pos), manual=TRUE, tip_length = 0,vjust=0.5) +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00), 
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,max(comp_means$y_pos+0.2))) +
    theme(axis.text.x=element_blank(),
        legend.position = "none",
        axis.title.x=element_blank()) +
  labs(y = "Integron ratio")


#Simpson diversity index
comp_means = compare_means(Simpson_index ~ Mechanism, data = dist_int_tax_join_filt, 
                           method = "wilcox.test",paired = F,
                           p.adjust.method = "fdr") %>%
  filter(p.adj < 0.05) 

comp_means = mw_custom_fun()

p4 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) + 
  geom_boxplot(outlier.shape = NA) +
  theme_light() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size=12),
        legend.position = "none",
        axis.title.x=element_blank()) +
  geom_point(aes(color = Simpson_index, size = Total_occurrences),
             position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
  scale_color_continuous(high = "red",low = "blue") +
  labs(title = "Simpson index", y = "ARG-MOB")+
  scale_size_continuous(range = c(0.5,3)) +
  stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
               width = 0.75, size = 1, linetype = "dashed") +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00), 
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1)) +
  scale_x_discrete(labels=c("ATR","ATP","AI","ATA","AE")) +
  annotate("text", x = 1:5, y = 1, label = c("bold(a)","bold(b)","bold(bc)","bold(c)","bold(c)"), 
           parse = T, size = 3)

p4_dens = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), Simpson_index)) +
  geom_boxplot(outlier.alpha = 0.5) +
  theme_light() +
  ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2, 
                                             annotations=p.signif, y_position=y_pos), manual=TRUE, tip_length = 0,vjust=0.5) +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00), 
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,max(comp_means$y_pos+0.2))) +
  theme(legend.position = "none",
        axis.title.x=element_blank(),
        axis.text.x = element_text(angle = 60, hjust = 1, size=12)) +
  scale_x_discrete(labels=c("ATR","ATP","AI","ATA","AE")) +
  labs(y = "Simpson index")


p5 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) + 
  geom_boxplot(outlier.shape =NA) +
  theme_light() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size=12),
        legend.position = "none",
        axis.title.y=element_blank(),
        axis.title.x=element_blank()) +
  geom_point(aes(color = Exp_Ratio, size = Total_occurrences),
             position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
  scale_color_continuous(high = "red",low = "blue") +
  labs(title = "High mob")+
  scale_size_continuous(range = c(0.5,3)) +
  scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
                     breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1))


# Get the legend only
px = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) + 
  geom_boxplot(outlier.shape =NA) +
  theme_light() +
  geom_point(aes(color = Simpson_index, size = Total_occurrences),
             position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
  scale_color_continuous(high = "red",low = "blue") +
  labs(title = "Simpson index", color = "Ratios",size = "Number of\nCRLs")

legend <- cowplot::get_legend(px)
# Plots of figure a,b titles
p_a = ggplot() + ggtitle("a") + theme(plot.title = element_text(face = "bold"))
p_b = ggplot() + ggtitle("b") + theme(plot.title = element_text(face = "bold"))
p_a_title = cowplot::get_title(p_a)
p_b_title = cowplot::get_title(p_b)

grid.arrange(legend, p_a_title, p1, p2, p3, p4, p_b_title, 
    p1_dens, p2_dens, p3_dens, p4_dens, layout_matrix = cbind(c(1), 
        c(2, 3, 4, 5, 6), c(7, 8, 9, 10, 11)), widths = c(1, 
        5, 5), heights = c(0.2, 1, 1, 1, 1))

print("Replacement (ATR)\nProtection (ATP)\nInactivation (AI)\nAlteration (ATA)\nEfflux (AE)")
## [1] "Replacement (ATR)\nProtection (ATP)\nInactivation (AI)\nAlteration (ATA)\nEfflux (AE)"
# Make interactive plot ggplotly(p1)
mytext = paste("ARO = ", dist_int_tax_join_filt$ARO, 
    "\n", "ARG-MOB = ", dist_int_tax_join_filt$ARGMOB, 
    "\n", "Mechanism = ", dist_int_tax_join_filt$Mechanism, 
    "\n", "Submech, = ", dist_int_tax_join_filt$Submechanism, 
    "\n", "IS ratio = ", dist_int_tax_join_filt$IS_ratio, 
    "\n", "Replicon ratio = ", dist_int_tax_join_filt$Replicon_ratio, 
    "\n", "Integron ratio = ", dist_int_tax_join_filt$Int_ratio, 
    "\n", "Simpson index = ", dist_int_tax_join_filt$Simpson_index, 
    "\n")
pp = plotly_build(p1)
style(pp, hovertext = mytext, hoverinfo = "text")

Create Pearson correlation plots between ARG-MOB parameter

Pairwise Pearson correlation charts are made for all mechanisms combined and then for each mechanism independently.

A heatmap of ARG-MOB parameters is created using the most abundant CRLs (min 20 regions)

for_corr = cbind(dist_int_tax_join$IS_ratio, dist_int_tax_join$Replicon_ratio, 
    dist_int_tax_join$Int_ratio, dist_int_tax_join$Simpson_index)

colnames(for_corr) = c("IS ratio", "Replicon ratio", 
    "Int ratio", "Simpson index")
chart.Correlation(for_corr, histogram = TRUE, pch = 19)

# Do correlation chart per mechanism group (only
# major) antibiotic_efflux
# antibiotic_target_replacement
# antibiotic_target_alteration
# antibiotic_inactivation
# antibiotic_target_protection

corr_fun = function(input_df, mech, plotx) {
    sub_corr = input_df[which(input_df$Mechanism == 
        mech), ]
    sub_corr = cbind(sub_corr$IS_ratio, sub_corr$Replicon_ratio, 
        sub_corr$Int_ratio, sub_corr$Simpson_index)
    colnames(sub_corr) = c("IS ratio", "Replicon ratio", 
        "Int ratio", "Simpson index")
    plotx <- chart.Correlation(sub_corr, histogram = TRUE, 
        pch = 19)
}


# Antibiotic efflux
corr_fun(dist_int_tax_join, "Antibiotic efflux", p1)

# Antibiotic target replacement
corr_fun(dist_int_tax_join, "Antibiotic target replacement", 
    p2)

# Antibiotic target alteration
corr_fun(dist_int_tax_join, "Antibiotic target alteration", 
    p3)

# Antibiotic inactivation
corr_fun(dist_int_tax_join, "Antibiotic inactivation", 
    p4)

# Antibiotic target protection
corr_fun(dist_int_tax_join, "Antibiotic target protection", 
    p5)

# Make heatmap. Filter out CRLs that occur less
# than 20 times, in order to avoid a cluttered
# heatmap
test = dist_int_tax_join %>% filter(Total_occurrences > 
    20) %>% filter(Mechanism != "Antibiotic efflux;Reduced permeability to antibiotic") %>% 
    filter(Mechanism != "Antibiotic target alteration;Antibiotic efflux") %>% 
    filter(Mechanism != "Reduced permeability to antibiotic") %>% 
    filter(Mechanism != "Antibiotic target alteration;Antibiotic target replacement")


for_heat = cbind(test$IS_ratio, test$Replicon_ratio, 
    test$Int_ratio, test$Simpson_index)
colnames(for_heat) = c("IS ratio", "Replicon ratio", 
    "Integron ratio", "Simpson index")

for_heatmap = t(as.data.frame(for_heat))
mechanisms_ARGMOB = data.frame(Mechanism = test$Mechanism, 
    `ARG-MOB` = as.numeric(test$ARGMOB))
colnames(mechanisms_ARGMOB) = c("Mechanism", "ARG-MOB")
colours = c(`All mechanisms` = "black", `Antibiotic efflux` = "#A6CEE3", 
    `Antibiotic inactivation` = "#1F78B4", `Antibiotic target alteration` = "#B2DF8A", 
    `Antibiotic target protection` = "#33A02C", `Antibiotic target replacement` = "#FB9A99")

col = list(Mechanism = c(`Antibiotic efflux` = "#A6CEE3", 
    `Antibiotic target replacement` = "#FB9A99", `Antibiotic target alteration` = "#B2DF8A", 
    `Antibiotic inactivation` = "#1F78B4", `Antibiotic target protection` = "#33A02C"), 
    `ARG-MOB` = circlize::colorRamp2(c(0, 1), c("white", 
        "red")))


ha = HeatmapAnnotation(df = mechanisms_ARGMOB, col = col)
pheat = Heatmap(for_heatmap, top_annotation = ha, cluster_rows = F, 
    name = "Ratio", heatmap_height = 0.1, column_dend_height = unit(30, 
        "mm"), heatmap_legend_param = list(ncol = 1, 
        legend_direction = "vertical"), col = circlize::colorRamp2(c(0, 
        1), c("blue", "red")))
pheat

Investigate how ARG-MOB parameters overlap using barplots.

A table of the top 10 ARG-MOB AROs per major mechanism is also produced. This table is not included in main text or supplemental information.

# Investigate how many CRLs are on plasmids that
# are also in integrons and associated with IS
# elements... And other combinations of these.
# Make a dummy variable in the arg_int dataset
# indicating that the ARG is in an integron
arg_int2 = arg_int
arg_int2$Integron = "Integron"
# Join with dist_data2
dist_data2_int = full_join(dist_data2, arg_int2, by = "Region") %>% 
    select(1, 2, 7, 9, 11, 24)
## Warning: Column `Region` joining factors with different levels, coercing to
## character vector
dist_data2_int[is.na(dist_data2_int)] <- "No integron"

dist_data2_int$Mean_dist.x = gsub("\\b0\\b", "No IS", 
    dist_data2_int$Mean_dist.x)
dist_data2_int$Mean_dist.x = gsub("[0-9]{1,}", "IS", 
    dist_data2_int$Mean_dist.x)
dist_data2_int$Mean_dist.x = gsub("\\.IS", "", dist_data2_int$Mean_dist.x)

dist_data2_int2 = dist_data2_int %>% group_by(Replicon.x, 
    Mean_dist.x, Integron, Mechanism.x) %>% summarise(no = n())

dist_data2_int2$Integron <- factor(dist_data2_int2$Integron, 
    levels = c("No integron", "Integron"))

# Subset to only major mechanisms
dist_data2_int3 = dist_data2_int2[which(dist_data2_int2$Mechanism.x == 
    "antibiotic_efflux" | dist_data2_int2$Mechanism.x == 
    "antibiotic_inactivation" | dist_data2_int2$Mechanism.x == 
    "antibiotic_target_alteration" | dist_data2_int2$Mechanism.x == 
    "antibiotic_target_protection" | dist_data2_int2$Mechanism.x == 
    "antibiotic_target_replacement"), ]
# Rename mechanisms
dist_data2_int3$Mechanism.x = gsub("antibiotic_efflux", 
    "Antibiotic efflux (AE)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("antibiotic_target_replacement", 
    "Antibiotic target replacement (ATR)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("antibiotic_target_alteration", 
    "Antibiotic target alteration (ATA)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("antibiotic_inactivation", 
    "Antibiotic inactivation (AI)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("reduced_permeability_to_antibiotic", 
    "Reduced permeability to antibiotic (RPA)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("antibiotic_target_protection", 
    "Antibiotic target protection (ATP)", dist_data2_int3$Mechanism.x)

dist_data2_int3$Mean_dist.x <- factor(dist_data2_int3$Mean_dist.x, 
    levels = c("No IS", "IS"))
dist_data2_int3$Integron <- factor(dist_data2_int3$Integron, 
    levels = c("No integron", "Integron"))


ggplot(dist_data2_int3, aes(Replicon.x, no, fill = Mechanism.x)) + 
    geom_bar(stat = "identity", position = "dodge") + 
    facet_wrap(Mean_dist.x ~ Integron, scales = "free_y", 
        drop = F) + theme_light() + scale_fill_brewer(palette = "Set1") + 
    labs(x = "Replicon", y = "# CRLs", fill = "Mechanism")

dist_data2_int3$Mean_dist.x <- factor(dist_data2_int3$Mean_dist.x, 
    levels = c("IS", "No IS"))
ggplot(dist_data2_int3, aes(Replicon.x, no, fill = Mean_dist.x)) + 
    geom_bar(stat = "identity", position = "stack") + 
    facet_wrap(~Integron, drop = F) + theme_light() + 
    scale_fill_brewer(palette = "Set1") + labs(x = "Replicon", 
    y = "# CRLs", fill = "Mobilized by IS") + theme(legend.position = c(0.75, 
    0.8))

# Make a table of the top 10 ARG-MOB AROs per major
# mechanism. Not included in main manuscript or
# supplemental
ai_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism == 
    "Antibiotic inactivation") %>% arrange(desc(ARGMOB)) %>% 
    head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12, 
    14)

ae_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism == 
    "Antibiotic efflux") %>% arrange(desc(ARGMOB)) %>% 
    head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12, 
    14)


aa_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism == 
    "Antibiotic target alteration") %>% arrange(desc(ARGMOB)) %>% 
    head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12, 
    14)

ap_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism == 
    "Antibiotic target protection") %>% arrange(desc(ARGMOB)) %>% 
    head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12, 
    14)

ar_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism == 
    "Antibiotic target replacement") %>% arrange(desc(ARGMOB)) %>% 
    head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12, 
    14)

all_mech_top10 = rbind(ai_sub, ae_sub, aa_sub, ap_sub, 
    ar_sub) %>% arrange(desc(ARGMOB))

# Merge with protein/nucleotide accession numbers
# from CARD
card_accs = read.csv("aro_acc.txt", sep = " ", header = F)
colnames(card_accs) = c("ARO", "Prot. acc.", "Nuc. acc.")
card_accs$ARO = as.character(card_accs$ARO)
card_accs$ARO = gsub("ARO:", "", card_accs$ARO)


colnames(all_mech_top10) = c("ARO", "CRL occurrences", 
    "Mechanism", "Submech.", "IS\nratio", "Replicon\nratio", 
    "Integron\nratio", "Simpson\nindex", "ARG-MOB")

all_mech_top10_acc = all_mech_top10 %>% left_join(card_accs, 
    by = "ARO")

all_mech_top10_acc$Mechanism = gsub("_", " ", all_mech_top10_acc$Mechanism)
all_mech_top10_acc$Submech. = gsub("_", " ", all_mech_top10_acc$Submech.)

colnames(all_mech_top10_acc) = c("ARO", "CRL occurrences", 
    "Mechanism", "Submech.", "IS\nratio", "Replicon\nratio", 
    "Integron\nratio", "Simpson\nindex", "ARG-MOB", 
    "Prot.acc", "Nuc.acc")
# Kable table kable(all_mech_top10_acc, digits =
# 2,longtable = T) %>% column_spec(2, width =
# '0.8cm') %>% column_spec(3, width = '2.5cm') %>%
# column_spec(4, width = '2.5cm') %>%
# column_spec(5:9, width = '0.6cm') %>%
# kable_styling(bootstrap_options = c('striped',
# 'hover', 'condensed'), latex_options =
# c('hold_position', 'repeat_header','scale_down'),
# font_size = 7)

datatable(all_mech_top10_acc) %>% formatRound(columns = 5:9, 
    digits = 3)
# Export as csv for inserting into word
# supplemental information
# colnames(all_mech_top10_acc) = c('ARO','CRL
# occurrences','Mechanism','Submech.','IS ratio',
# 'Replicon ratio','Integron ratio','Simpson
# index','ARG-MOB', 'Prot.acc','Nuc.acc')

# write.csv2(all_mech_top10_acc,'all_mech_top10_acc.csv',sep
# = '\t')

Defining ARG-MOB categories

Based on manual estimates from ARG-MOB densities, groups of low to high ARG-MOB are defined. First, the density of ARG-MOB is plotted for all categories

ggplot(dist_int_tax_join, aes(ARGMOB,stat(count))) +
  geom_density() +
  scale_x_continuous(breaks=seq(0,1,0.05)) +
  theme_light()

#ARGMOB categories
#Very low:  ARGMOB = 0
#Low:       ARGMOB > 0 < 0.175      
#Medium:    ARGMOB > 0.175 < 0.375  
#High:      ARGMOB > 0.375 < 0.675  
#Very high  ARGMOB > 0.675

#Find local minima
dens = density(dist_int_tax_join$ARGMOB)
#Adjust y coordinates by the number of observations. This is also done in the ggplot geom_density function
dens$y = dens$y * nrow(dist_int_tax_join)
densy=dens$y
densx=dens$x
low_minimum = densx[which(densy == min(densy[densx < 0.2 & densx > 0]))]
medium_minimum = densx[which(densy == min(densy[densx < 0.5 & densx > 0.2]))]

#Cutoff between high and very high is more tricky, as there is no minimum (valley), only a change in slope. Isolate two intervals of data and fit straight lines on the data. The cutoff is where the two line intersect.
high_interval = data.frame(cbind(densx[densx < 0.65 & densx > 0.55],densy[densx < 0.65 & densx > 0.55]))
colnames(high_interval) = c("x","y")
#Check that the range is reasonibly linear
plot(high_interval)

high_line = lm(y ~ x, high_interval)
high_coef = t(high_line$coefficients)
colnames(high_coef) = c("intercept","slope")

vhigh_interval = data.frame(cbind(densx[densx < 1 & densx > 0.7],densy[densx < 1 & densx > 0.7]))
colnames(vhigh_interval) = c("x","y")
#Check that the range is reasonibly linear
plot(vhigh_interval)

vhigh_line = lm(y ~ x, vhigh_interval)
vhigh_coef = t(vhigh_line$coefficients)
colnames(vhigh_coef) = c("intercept","slope")

#Where do the lines intersect? This is the cutoff between high and very high ARG-MOB
line_intersect <- rbind(coef(high_line),coef(vhigh_line))
high_minimum = c(-solve(cbind(line_intersect[,2],-1)) %*% line_intersect[,1])
high_minimum = high_minimum[1]

#vlow = 0
#low = 0.175
#medium = 0.385
#high = 0.75
#vhigh = 0.75

vlow = 0
low = low_minimum
medium = medium_minimum
high = high_minimum
vhigh = high_minimum


#Plot thresholds in vallyes and the high-very high intersection
ggplot(dist_int_tax_join, aes(ARGMOB,stat(count))) +
  geom_density() +
  scale_x_continuous(breaks=seq(0,1,0.05)) +
  theme_light() +
  annotate("rect",xmin=0,xmax=low, ymin=0,ymax=3700, fill = "#ABDDA4", alpha = 0.2) +
  annotate("text",x = low/2,y = 3800, label = "Low") +
  annotate("rect",xmin=low,xmax=medium, ymin=0,ymax=3700, fill = "#FFFFBF", alpha = 0.2) +
  annotate("text",x = 0.28,y = 3800, label = "Medium") +
  annotate("rect",xmin=medium,xmax=high, ymin=0,ymax=3700, fill = "#FDAE61", alpha = 0.2) +
  annotate("text",x = 0.53,y = 3800, label = "High") +
  annotate("rect",xmin=high,xmax=1, ymin=0,ymax=3700, fill = "#D7191C", alpha = 0.2) +
  annotate("text",x = 0.83,y = 3800, label = "Very high") +
  geom_abline(slope = high_coef[,2], intercept = high_coef[,1],color = "red", linetype = 2) +
  geom_abline(slope = vhigh_coef[,2], intercept = vhigh_coef[,1], color = "red",linetype = 2) +
  geom_vline(xintercept = high_minimum) +
  geom_vline(xintercept = medium_minimum) +
  geom_vline(xintercept = low_minimum) +
  labs(x = "ARG-MOB score", y = "Normalized count") +
  ggtitle("Smoothed kernel density estimate\nand ARG-MOB groupings") +
  theme(plot.margin = unit(c(5.5, 20, 5.5, 5.5), "points"))

#Make a prettier plot
#Subset to only major mechanisms
dist_int_tax_join2 = dist_int_tax_join %>% filter(Mechanism %in% c("Antibiotic efflux", 
                                                                   "Antibiotic inactivation",
                                                                   "Antibiotic target replacement",
                                                                   "Antibiotic target protection",
                                                                   "Antibiotic target alteration"))
dist_int_tax_join_cat = dist_int_tax_join %>% 
  mutate(ARGMOB_category = ifelse(ARGMOB == vlow, "Very low",
                                                    ifelse(ARGMOB > vlow & ARGMOB < low, "Low",
                                                           ifelse(ARGMOB >= low & ARGMOB < medium,"Medium",
                                                                  ifelse(ARGMOB >= medium & ARGMOB < high, "High",
                                                                         ifelse(ARGMOB >= high, "Very high",NA))))))
dist_int_tax_join_cat$ARGMOB_category <- factor(dist_int_tax_join_cat$ARGMOB_category,
                                              levels = c("Very high","High","Medium","Low","Very low"))

  colours = c("All mechanisms" = "black", "Antibiotic efflux" = "#A6CEE3", "Antibiotic inactivation" = "#1F78B4", "Antibiotic target alteration" = "#B2DF8A", "Antibiotic target protection" = "#33A02C", "Antibiotic target replacement" = "#FB9A99")

p_ARGMOB = ggplot(unique(dist_int_tax_join2), aes(ARGMOB,stat(count), color = Mechanism)) +
  geom_vline(xintercept = c(vlow,low,medium,high,vhigh)) +
  annotate("rect",xmin=0,xmax=low, ymin=0,ymax=Inf, fill = "#ABDDA4", alpha = 0.2) +
  annotate("text",x = low/2,y = 3800, label = "italic(Low)", parse =T) +
  annotate("rect",xmin=low,xmax=medium, ymin=0,ymax=Inf, fill = "#FFFFBF", alpha = 0.2) +
  annotate("text",x = 0.28,y = 3800, label = "italic(Medium)", parse =T) +
  annotate("rect",xmin=medium,xmax=high, ymin=0,ymax=Inf, fill = "#FDAE61", alpha = 0.2) +
  annotate("text",x = 0.53,y = 3800, label = "italic(High)", parse =T) +
  annotate("rect",xmin=high,xmax=1, ymin=0,ymax=Inf, fill = "#D7191C", alpha = 0.2) +
  annotate("text",x = 0.83,y = 3800, label = "Very high", parse =F, fontface = 'italic') +
  geom_density(size = 2) +
  geom_density(data = unique(dist_int_tax_join), 
               aes(ARGMOB,stat(count), colour = "All mechanisms"), 
               size = 2, linetype = 1) +
  scale_x_continuous(breaks=seq(0,1,0.05)) +
  theme_light() +
  #scale_color_brewer(palette = "Paired") +
  labs(x = "ARG-MOB",y="Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom",
        plot.title = element_text(face = "bold")) +
  #ggtitle('A: Density of ARG-MOB per mechanism') +
  ggtitle("a") +
  guides(col = guide_legend(nrow = 3)) +
  scale_color_manual(values = colours)

dist_int_tax_join_cat2 = dist_int_tax_join_cat %>% 
  filter(Mechanism %in% c("Antibiotic efflux", 
                          "Antibiotic inactivation",
                          "Antibiotic target replacement",
                          "Antibiotic target protection",
                          "Antibiotic target alteration"))

dist_int_tax_join_cat2$Mechanism <- factor(dist_int_tax_join_cat2$Mechanism,levels = c("Antibiotic inactivation",
                                                                                     "Antibiotic efflux",
                                                                                     "Antibiotic target alteration",
                                                                                     "Antibiotic target protection",
                                                                                     "Antibiotic target replacement"))

p_ARGMOB2 = ggplot(unique(dist_int_tax_join_cat2), aes(Mechanism, fill = ARGMOB_category)) +
  geom_bar(stat = "count") +
  theme_light() +
  theme(legend.position = "bottom",
    #axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold"),
    legend.text = element_text(face = "italic")) +
  scale_fill_brewer(palette = "Spectral") +
  labs(fill = "ARG-MOB category",y = "Count") +
  #ggtitle("B: Category count per mechanism")
  ggtitle("b") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15))



#Difficult to format this figure properly in Rmarkdown
ggarrange(p_ARGMOB, p_ARGMOB2, widths = c(1,1.2))

ggplot(unique(dist_int_tax_join2), aes(ARGMOB,stat(count))) +
  geom_density(size = 2) +
  geom_vline(xintercept = c(vlow,low,medium,high,vhigh)) +
  annotate("text",x = low/2,y = 0, label = "Low", angle = 0,vjust=0) +
  annotate("text",x = 0.28,y = 0, label = "Medium", angle = 0,vjust=0) +
  annotate("text",x = 0.53,y = 0, label = "High", angle =0,vjust=0) +
  annotate("text",x = 0.83,y = 0, label = "Very high", angle = 0,vjust=0) +
  scale_x_continuous(breaks=seq(0,1,0.1)) +
  theme_light() +
  labs(x = "ARG-MOB",y="Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") +
  guides(col = guide_legend(nrow = 3)) +
  facet_wrap(~Mechanism, scales = "free_y")

Extract high and very high ARG-MOB AROs and plot them

# Plot only high and very high AROs
top_sub = unique(dist_int_tax_join_filt) %>% filter(ARGMOB > 
    medium) %>% group_by(Submechanism) %>% filter(n() > 
    1)

# How many AI are high and very high?
top_sub %>% filter(Mechanism == "Antibiotic inactivation") %>% 
    filter(ARGMOB > medium & ARGMOB < high) %>% ungroup() %>% 
    tally()
## # A tibble: 1 x 1
##       n
##   <int>
## 1   145
top_sub %>% filter(Mechanism == "Antibiotic inactivation") %>% 
    filter(ARGMOB > high) %>% ungroup() %>% tally()
## # A tibble: 1 x 1
##       n
##   <int>
## 1    53
# What is the percentage of ATR high and very high?
atr_num = as.numeric(unique(dist_int_tax_join) %>% 
    filter(Mechanism == "Antibiotic target replacement") %>% 
    tally())

atr_high = as.numeric(unique(dist_int_tax_join) %>% 
    filter(Mechanism == "Antibiotic target replacement") %>% 
    filter(ARGMOB > medium) %>% tally())
atr_high/atr_num * 100
## [1] 64.10256
# Get submechanisms that have at least two AROs in
# High or Very high categories.
high_subs = unique(top_sub$Submechanism)
high_subs_df = dist_int_tax_join_filt[dist_int_tax_join_filt$Submechanism %in% 
    high_subs, ]

top_sub = top_sub %>% mutate(ARO = reorder(Submechanism, 
    ARGMOB, mean))
high_subs_df = high_subs_df %>% group_by(Submechanism) %>% 
    mutate(ARO = reorder(Submechanism, ARGMOB, mean))
high_subs_df$Submechanism = gsub("_", " ", high_subs_df$Submechanism)
# vlow = 0 low = 0.175 medium = 0.385 high = 0.75
# vhigh = 0.75

vlow = 0
low = low_minimum
medium = medium_minimum
high = high_minimum
vhigh = high_minimum


colours = c(`Antibiotic efflux` = "#A6CEE3", `Antibiotic inactivation` = "#1F78B4", 
    `Antibiotic target alteration` = "#B2DF8A", `Antibiotic target protection` = "#33A02C", 
    `Antibiotic target replacement` = "#FB9A99")

# Shorten names for plotting
high_subs_df$Submechanism = gsub("beta-lactamase", 
    "b-l", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("transferase", "trans.", 
    high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("reductase", "red.", 
    high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("resistant", "res.", 
    high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("resistance", "res.", 
    high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("dfr", "", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("protein", "", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("rRNA", "", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("phosphoethanolamine", 
    "PEA", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("major facilitator superfamily \\(MFS\\)", 
    "MFS", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("\\(LNU\\)", "", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("ribosomal protection", 
    "rib. prot.", high_subs_df$Submechanism)

# Submechanisms with High and Very high ARG-MOB
# AROs
p_med_high_aros = ggplot(high_subs_df, aes(x = reorder(Submechanism, 
    -ARGMOB, FUN = median, ), ARGMOB)) + geom_hline(yintercept = high, 
    linetype = "dashed") + geom_hline(yintercept = medium, 
    linetype = "dashed") + geom_hline(yintercept = low, 
    linetype = "dashed") + annotate("rect", xmin = -Inf, 
    xmax = Inf, ymin = high, ymax = 1, fill = "#D7191C", 
    alpha = 0.2) + annotate("rect", xmin = -Inf, xmax = Inf, 
    ymin = medium, ymax = high, fill = "#FDAE61", alpha = 0.2) + 
    annotate("rect", xmin = -Inf, xmax = Inf, ymin = low, 
        ymax = medium, fill = "#FFFFBF", alpha = 0.2) + 
    annotate("rect", xmin = -Inf, xmax = Inf, ymin = 0, 
        ymax = low, fill = "#ABDDA4", alpha = 0.2) + 
    geom_boxplot(outlier.alpha = 0) + theme_light() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1, 
        size = 7.5), axis.title.x = element_blank(), 
        legend.position = "bottom", legend.direction = "horizontal", 
        legend.margin = margin(), plot.title = element_text(face = "bold")) + 
    geom_jitter(aes(size = Total_occurrences, color = Mechanism), 
        alpha = 0.8) + labs(size = "CRL occurrences", 
    y = "ARG-MOB") + scale_color_manual(values = colours, 
    labels = c("Antibiotic efflux (AE)", "Antibiotic inactivation", 
        "Antibiotic target alteration (ATA)", "Antibiotic target protection (ATP)", 
        "Antibiotic target replacement (ATR)")) + # ggtitle('Submechanisms with High and Very high
# ARG-MOB AROs') +
ggtitle("c") + # guides(colour = guide_legend(override.aes =
# list(size=6),nrow=2,byrow=TRUE)) +
guides(colour = FALSE) + scale_y_continuous(breaks = seq(0, 
    1, 0.1)) + scale_x_discrete(labels = function(x) str_wrap(x, 
    width = 22))


# Composite plot
grid.arrange(p_ARGMOB, p_ARGMOB2, p_med_high_aros, 
    nrow = 2, layout_matrix = rbind(c(1, 2), c(3)), 
    heights = c(1, 1))

# Mean of the VIM beta-lactamases
vim_aros = dist_int_tax_join_filt %>% filter(Submechanism == 
    "VIM_beta-lactamase")
vim_aros$weighted_mean = vim_aros$Total_occurrences * 
    vim_aros$ARGMOB
vim_mean = sum(vim_aros$weighted_mean)/sum(vim_aros$Total_occurrences)

# Not used
weighted_mean_AROs = dist_int_tax_join_filt %>% filter(Total_occurrences > 
    1) %>% mutate(weighted_mean = Total_occurrences * 
    ARGMOB) %>% group_by(Submechanism) %>% summarise(`Weighted mean ARG-MOB` = sum(weighted_mean)/sum(Total_occurrences), 
    CRLs = sum(Total_occurrences), `Unweighted median ARG-MOB` = median(ARGMOB))
datatable(weighted_mean_AROs)
# Print an interactive table of high and very high
# ARG-MOB AROs. NOT USED IN MANUSCRIPT
top_sub = dist_int_tax_join_filt %>% filter(ARGMOB > 
    medium) %>% group_by(Submechanism) %>% filter(n() > 
    1)

top_sub_table = top_sub %>% unique() %>% select(1, 
    4, 5, 2, 3, 6, 10, 11, 12, 14)

colnames(top_sub_table) = c("ARO", "Mechanism", "Submechanism", 
    "CRLs", "IS ratio", "Replicon ratio", "Integron ratio", 
    "# unique genera", "Simpson index", "ARG-MOB")
datatable(top_sub_table) %>% formatRound(columns = c("IS ratio", 
    "Replicon ratio", "Integron ratio", "Simpson index", 
    "ARG-MOB"), digits = 3)

Some AROs have large variance from the mean IS and replicon ratios

AROs with large variance (e.g. oqxAB) are usually not mobilized but have been in many cases. Investigate these and find AROs with largest spread from mean.

##### Calculate which AROs have largest variance for IS
##### and Replicon ratios based on genera ####
aro_tax = read.csv("aro_tax.Rdat", sep = "\t", header = F)

colnames(aro_tax) = c("Region", "ARO", "Up_dist", "Down_dist", 
    "IS_up", "IS_down", "Replicon", "Sim_cutoff", "Mean_dist", 
    "Cluster_size", "Mechanism", "Submechanism", "Acc.", 
    "Superkingdom", "Kingdom", "Phylum", "Class", "Order", 
    "Family", "Genus")

aro_tax$Mechanism = gsub("antibiotic_efflux", "Antibiotic efflux", 
    aro_tax$Mechanism)
aro_tax$Mechanism = gsub("antibiotic_target_replacement", 
    "Antibiotic target replacement", aro_tax$Mechanism)
aro_tax$Mechanism = gsub("antibiotic_target_alteration", 
    "Antibiotic target alteration", aro_tax$Mechanism)
aro_tax$Mechanism = gsub("antibiotic_inactivation", 
    "Antibiotic inactivation", aro_tax$Mechanism)
aro_tax$Mechanism = gsub("reduced_permeability_to_antibiotic", 
    "Reduced permeability to antibiotic", aro_tax$Mechanism)
aro_tax$Mechanism = gsub("antibiotic_target_protection", 
    "Antibiotic target protection", aro_tax$Mechanism)


aro_tax$ARO = gsub("ARO:", "", aro_tax$ARO)
# If superkingdom is empty, remove the line aro_tax
# <- aro_tax[-which(aro_tax$Superkingdom == ''), ]
aros = unique(as.vector(aro_tax$ARO))

# Try to make a ratio of the IS spread and use it
# in the general classification
df_sd_tax <- data.frame(ARO = character())
df_sd_tax2 <- data.frame(ARO = character())
# If the below loop fails, it helps to restart R
# for some reason.

for (ARO in aros) {
    temp <- aro_tax[which(aro_tax$ARO == ARO), ]
    mech = unique(as.character(temp$Mechanism))
    temp2 = temp %>% group_by(Genus) %>% summarise(mean_dist = mean(Mean_dist), 
        zeroes = sum(Mean_dist == 0), plasmids = sum(Replicon == 
            "Plasmid"), chrs = sum(Replicon == "Chr"), 
        ratio = sum(Mean_dist > 0)/(sum(Mean_dist == 
            0) + sum(Mean_dist > 0)), occurrences = n(), 
        replicon_type = sum(Replicon == "Plasmid")/(sum(Replicon == 
            "Plasmid") + sum(Replicon == "Chr")))
    # Filter out ARO:Genus combinations that occur only
    # once (singletons). Many of these are likely not
    # correctly annotated genera
    temp2_2 = temp2[which(temp2$occurrences > 1), ]
    sum_occur = sum(temp2_2$occurrences)
    is_mean = (sum(temp2_2$occurrences) - sum(temp2_2$zeroes))/sum(temp2_2$occurrences)
    temp2_2$is_var = temp2_2$ratio - is_mean
    rep_mean = sum(temp2_2$plasmids)/sum(temp2_2$occurrences)
    temp2_2$rep_var = temp2_2$replicon_type - rep_mean
    # Filter out ARO:Genus combinations that occur only
    # once (singletons). Many of these are likely not
    # correctly annotated genera
    temp3 = temp2[which(temp2$occurrences > 1), ] %>% 
        summarise(I_Variance = var(ratio), I_SD = sd(ratio), 
            I_Mean = mean(ratio), R_Variance = var(replicon_type), 
            R_SD = sd(replicon_type), R_Mean = mean(replicon_type))
    temp4 = as.data.frame(cbind(ARO, mech, temp3))
    if (nrow(temp2_2) > 0) {
        temp5 = as.data.frame(cbind(ARO, mech, temp2_2))
        df_sd_tax2 = rbind(df_sd_tax2, temp5)
    } else {
        df_sd_tax2 = df_sd_tax2
    }
    df_sd_tax = rbind(df_sd_tax, temp4)
}



df_sd_tax2_3 = df_sd_tax2 %>% group_by(mech, Genus) %>% 
    summarise(Positive_spread = sum(is_var[is_var > 
        0]), Negative_spread = sum(is_var[is_var < 
        0]), Positive_spread_rep = sum(rep_var[rep_var > 
        0]), Negative_spread_rep = sum(rep_var[rep_var < 
        0])) %>% arrange(desc(Positive_spread)) %>% 
    filter(Positive_spread + abs(Negative_spread) > 
        1) %>% filter(mech != "Antibiotic efflux;reduced_permeability_to_antibiotic")

p1 = ggplot(df_sd_tax2_3) + geom_bar(aes(x = reorder(Genus, 
    Positive_spread, FUN = sum), y = Positive_spread, 
    fill = mech), stat = "identity") + geom_bar(aes(x = reorder(Genus, 
    Positive_spread, FUN = sum), y = Negative_spread, 
    fill = mech), stat = "identity") + coord_flip() + 
    theme_light() + geom_hline(yintercept = 0) + scale_fill_brewer(palette = "Paired") + 
    scale_x_discrete(position = "top") + labs(y = "Summed IS mob. spread", 
    fill = "Mechanism") + theme(legend.position = "none", 
    axis.title.y = element_blank())


p2 = ggplot(df_sd_tax2_3) + geom_bar(aes(x = reorder(Genus, 
    Positive_spread, FUN = sum), y = Positive_spread_rep, 
    fill = mech), stat = "identity") + geom_bar(aes(x = reorder(Genus, 
    Positive_spread, FUN = sum), y = Negative_spread_rep, 
    fill = mech), stat = "identity") + coord_flip() + 
    theme_light() + geom_hline(yintercept = 0) + scale_fill_brewer(palette = "Paired") + 
    theme(legend.position = "none", axis.title.y = element_blank(), 
        axis.text.y = element_blank()) + labs(y = "Summed Rep. mob. spread")

p_legend = ggplot(df_sd_tax2_3) + geom_bar(aes(x = reorder(Genus, 
    Positive_spread, FUN = sum), y = Positive_spread, 
    fill = mech), stat = "identity") + scale_fill_brewer(palette = "Paired") + 
    labs(y = "Summed IS mob. spread", fill = "Mechanism") + 
    theme(legend.position = "bottom") + guides(fill = guide_legend(nrow = 2))

legend <- cowplot::get_legend(p_legend)

grid.arrange(legend, p1, p2, nrow = 2, layout_matrix = rbind(c(2, 
    3), c(1)), heights = c(10, 1), top = textGrob("Summed spread from means of ratios per ARO grouped by genus", 
    gp = gpar(fontsize = 14, font = 1)))

# Group by ARO only to find the most diverging ARGs
df_sd_tax2_4 = df_sd_tax2 %>% group_by(ARO, mech) %>% 
    summarise(Positive_spread = sum(is_var[is_var > 
        0]), Negative_spread = sum(is_var[is_var < 
        0]), Positive_spread_rep = sum(rep_var[rep_var > 
        0]), Negative_spread_rep = sum(rep_var[rep_var < 
        0]), test_rat = sum(ratio)/sum(abs(is_var))) %>% 
    arrange(desc(Positive_spread)) %>% filter(Positive_spread + 
    abs(Negative_spread) > 1) %>% filter(mech != "Antibiotic efflux;Reduced permeability to antibiotic") %>% 
    filter(mech != "Antibiotic target alteration;Antibiotic target replacement") %>% 
    filter(mech != "Antibiotic efflux;reduced_permeability_to_antibiotic")


df_sd_tax2_5 = melt(df_sd_tax2_4, id.vars = c("ARO", 
    "mech"), measure.vars = c("Positive_spread", "Negative_spread", 
    "Positive_spread_rep", "Negative_spread_rep"))

p5 = ggplot(df_sd_tax2_5) + geom_bar(aes(x = reorder(ARO, 
    value, FUN = sum), y = value, fill = variable), 
    stat = "identity") + coord_flip() + theme_light() + 
    geom_hline(yintercept = 0) + scale_fill_manual(values = c("red", 
    "red", "blue", "blue")) + scale_x_discrete(position = "top") + 
    labs(y = "Summed IS (blue) and Replicon (red) spread", 
        fill = "Parameter", x = "AROs") + theme(legend.position = "non") + 
    facet_wrap(~mech, scales = "free_y", nrow = 1)
p5

# Show oqxA example as table
oqxA_table = df_sd_tax2 %>% filter(ARO == "3003922") %>% 
    select("Genus", "occurrences", "chrs", "plasmids", 
        "zeroes", "ratio", "replicon_type") %>% arrange(desc(occurrences))
colnames(oqxA_table) = c("# Identified in total", "# Found on chromosome", 
    "# Found on plasmid", "#No IS association", "IS ratio", 
    "Replicon ratio")
oqxA_table
##              # Identified in total # Found on chromosome # Found on plasmid
## 1                       Klebsiella                   458                457
## 2                     Enterobacter                   124                123
## 3                      Escherichia                    36                  2
## 4                       Salmonella                    20                  3
## 5                       Raoultella                    14                 14
## 6                      Citrobacter                     8                  8
## 7                        Kosakonia                     7                  7
## 8                       Lelliottia                     6                  6
## 9                          Cedecea                     5                  5
## 10                     Phytobacter                     2                  2
## 11 unclassified_Enterobacteriaceae                     2                  2
##    #No IS association IS ratio Replicon ratio          NA
## 1                   1      387     0.15502183 0.002183406
## 2                   1      111     0.10483871 0.008064516
## 3                  34        0     1.00000000 0.944444444
## 4                  17        0     1.00000000 0.850000000
## 5                   0       13     0.07142857 0.000000000
## 6                   0        8     0.00000000 0.000000000
## 7                   0        4     0.42857143 0.000000000
## 8                   0        6     0.00000000 0.000000000
## 9                   0        3     0.40000000 0.000000000
## 10                  0        2     0.00000000 0.000000000
## 11                  0        2     0.00000000 0.000000000

A sheet with all relevant information for all evaluated 1176 AROs is exported. This table is provided as supplemental information and can be used as a lookup table for finding highly mobilized AROs (or AROs with other relevant traits)

# Add the spreads to the other data for exporting
# as excel sheet
df_sd_tax2_6 = df_sd_tax2 %>% group_by(ARO, mech) %>% 
    summarise(Positive_spread = sum(is_var[is_var > 
        0]), Negative_spread = sum(is_var[is_var < 
        0]), Positive_spread_rep = sum(rep_var[rep_var > 
        0]), Negative_spread_rep = sum(rep_var[rep_var < 
        0]), test_rat = sum(ratio)/sum(abs(is_var))) %>% 
    arrange(desc(Positive_spread))

tbl_export = df_sd_tax2_6 %>% full_join(dist_int_tax_join, 
    by = "ARO") %>% select("ARO", "Mechanism", "Submechanism", 
    "Total_occurrences", "IS_ratio", "Replicon_ratio", 
    "Int_ratio", "Simpson_index", "ARGMOB", "unique_genera", 
    "Int_occurrences", "Int_IS_ratio", "Int_replicon_ratio", 
    "Positive_spread", "Negative_spread", "Positive_spread_rep", 
    "Negative_spread_rep")

# Join with df from main analysis - requires that
# main analysis has been run

# First remake df to make sure the format is
# correct
dist_data = read.csv("ARG_IS_distances_mech.txt", sep = "\t", 
    header = F)
colnames(dist_data) = c("Region", "ARO", "Updist", 
    "Downdist", "IS_Up", "IS_Down", "Replicon", "Sim_cutoff", 
    "Mean_dist", "Cluster_size", "Mechanism", "Submechanism")
dist_data$Cluster_size = gsub("size_", "", dist_data$Cluster_size)

# For each region with an ARG, there can be up to
# two closest IS element: upstream and downstream.
# The distance to these are denoted as Updist and
# Downdist, while the Mean_dist column is the mean
# of the two. A distance of 0 indicates that no IS
# element was found.  Cluster_size refers to how
# many individual regions were clustered to form a
# given CRL Region.


dist_data2 = dist_data
dist_data2$ARO = gsub("ARO:", "", dist_data2$ARO)

# Make a new dataframe where various metric are
# calculated and summarized

all_aros = unique(as.vector(dist_data2$ARO))
df <- data.frame(ARO = character(), Occurrences = numeric(), 
    Ratio = numeric(), Mean_distance = numeric(), SD_distance = numeric(), 
    Mechanism = character(), Submechanism = character(), 
    Zero_IS = numeric(), Replicon = character(), Total_seqs = numeric())

for (ARO in all_aros) {
    temp <- dist_data2[which(dist_data2$ARO == ARO), 
        ]
    Zero_IS = nrow(temp[which(temp$Mean_dist == 0), 
        ])
    number_nonzero = nrow(temp[which(temp$Mean_dist > 
        0), ])
    Ratio = number_nonzero/nrow(temp)
    Occurrences = nrow(temp)
    temp_nonzero = temp[which(temp$Mean_dist > 0), 
        ]
    Mean_distance = mean(temp_nonzero$Mean_dist)
    SD_distance = sd(temp_nonzero$Mean_dist)
    Mechanism = unique(as.vector(temp$Mechanism))
    IS_Up = unique(as.vector(temp$IS_Up))
    IS_Down = unique(as.vector(temp$IS_Down))
    Submechanism = unique(as.vector(temp$Submechanism))
    p_rep = nrow(temp[which(temp$Replicon == "Plasmid"), 
        ])
    c_rep = nrow(temp[which(temp$Mean_dist == "Chr"), 
        ])
    Replicon = p_rep/nrow(temp)
    Total_seqs = sum(as.numeric(temp$Cluster_size))
    temp2 = as.data.frame(cbind(ARO, Occurrences, Ratio, 
        Mean_distance, SD_distance, Mechanism, Submechanism, 
        Zero_IS, Replicon, Total_seqs))
    df = rbind(df, temp2)
}


# Make sure that values in columns have a correct
# (numeric) format
df$Occurrences = as.numeric(as.character(df$Occurrences))
df$Mean_distance = as.numeric(as.character(df$Mean_distance))
df$SD_distance = as.numeric(as.character(df$SD_distance))
df$Ratio = as.numeric(as.character(df$Ratio))
df$Zero_IS = as.numeric(as.character(df$Zero_IS))
df$Replicon = as.numeric(as.character(df$Replicon))
df$Total_seqs = as.numeric(as.character(df$Total_seqs))



tbl_export2 = tbl_export %>% full_join(df, by = "ARO") %>% 
    select(1:4, "Total_seqs", 5:17) %>% full_join(card_accs, 
    by = "ARO") %>% unique()

colnames(tbl_export2) = c("ARO", "Mechanism", "Submechanism", 
    "CRLs", "Total unclustered regions", "IS ratio", 
    "Replicon ratio", "Integron ratio", "Simpson index", 
    "ARG-MOB", "Occurs in # genera", "# integron cases", 
    "Ratio of integron cases with IS elements", "Ratio of integron cases on plasmids", 
    "Summed positive spread from mean of IS ratio", 
    "Summed negative spread from mean of IS ratio", 
    "Summed positive spread from mean of replicon ratio", 
    "Summed negative spread from mean of replicon ratio", 
    "Example protein accession", "Example nucleotide accession")
datatable(tbl_export2, rownames = FALSE, filter = "top", 
    options = list(pageLength = 10, scrollX = T)) %>% 
    formatRound(columns = 6:18, digits = 2)
# columnDefs = list(list()))) %>%
# write.csv2(tbl_export2,file =
# 'ARG_MOB_data.csv',sep = '\t',dec = '.')

Make table and simple plot of Mechanism, CRL, and ARO counts

# Import the overview of mechanism count, generated
# in bash script
loci_count = read.csv("passing_regions_mech", header = F, 
    sep = " ")
colnames(loci_count) = c("Loci", "Mechanism")
crl_count = dist_data2 %>% group_by(Mechanism) %>% 
    tally()
colnames(crl_count) = c("Mechanism", "CRLs")
aro_count = df %>% group_by(Mechanism) %>% tally()
colnames(aro_count) = c("Mechanism", "AROs")

# Merge the three count tables
three_counts = full_join(loci_count, crl_count, by = "Mechanism") %>% 
    full_join(aro_count, by = "Mechanism") %>% select(2, 
    1, 3, 4) %>% arrange(desc(Loci))
three_counts$Mechanism = gsub("antibiotic_efflux", 
    "Antibiotic efflux (AE)", three_counts$Mechanism)
three_counts$Mechanism = gsub("antibiotic_target_replacement", 
    "Antibiotic target replacement (ATR)", three_counts$Mechanism)
three_counts$Mechanism = gsub("antibiotic_target_alteration", 
    "Antibiotic target alteration (ATA)", three_counts$Mechanism)
three_counts$Mechanism = gsub("antibiotic_inactivation", 
    "Antibiotic inactivation (AI)", three_counts$Mechanism)
three_counts$Mechanism = gsub("reduced_permeability_to_antibiotic", 
    "Reduced permeability to antibiotic (RPA)", three_counts$Mechanism)
three_counts$Mechanism = gsub("antibiotic_target_protection", 
    "Antibiotic target protection (ATP)", three_counts$Mechanism)
three_counts$Mechanism = gsub(";", ";\n", three_counts$Mechanism)

# Make a table for under the plot
tt <- ttheme_default(core = list(bg_params = list(fill = c("#A6CEE3", 
    "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
    "#FDBF6F", "#FF7F00", "#CAB2D6")), fg_params = list(hjust = 0, 
    x = 0.1)), rowhead = list(fg_params = list(hjust = 0, 
    x = 0)), padding = unit(c(10, 4), "mm"), base_size = 10, 
    )

tbl <- tableGrob(three_counts, rows = NULL, theme = tt)
counts_plot = ggplot(three_counts, aes(Loci, AROs, 
    size = CRLs, color = reorder(Mechanism, desc(Loci), 
        FUN = sum))) + geom_point() + theme_light() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        strip.text.x = element_text(angle = 45), legend.position = "none") + 
    scale_color_brewer(palette = "Paired")

grid.arrange(counts_plot, tbl, nrow = 1, widths = c(3, 
    5))